From d2fd6a7d76ec90999d502ddd34c2a844af6848db Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 11 Jun 2024 16:23:54 -0700 Subject: [PATCH 01/60] Initial MLI schemas and MessageHandler class (#607) This PR adds the Capnproto schemas and initial MessageHandler class and tests. --- doc/changelog.md | 8 + setup.py | 1 + smartsim/_core/mli/__init__.py | 0 smartsim/_core/mli/message_handler.py | 535 +++++++++++ .../mli_schemas/data/data_references.capnp | 35 + .../mli_schemas/data/data_references_capnp.py | 15 + .../data/data_references_capnp.pyi | 79 ++ .../mli/mli_schemas/request/request.capnp | 61 ++ .../request_attributes.capnp | 49 + .../request_attributes_capnp.py | 15 + .../request_attributes_capnp.pyi | 83 ++ .../mli/mli_schemas/request/request_capnp.py | 15 + .../mli/mli_schemas/request/request_capnp.pyi | 286 ++++++ .../mli/mli_schemas/response/response.capnp | 51 + .../response_attributes.capnp | 33 + .../response_attributes_capnp.py | 15 + .../response_attributes_capnp.pyi | 77 ++ .../mli_schemas/response/response_capnp.py | 12 + .../mli_schemas/response/response_capnp.pyi | 178 ++++ .../_core/mli/mli_schemas/tensor/tensor.capnp | 80 ++ .../mli/mli_schemas/tensor/tensor_capnp.py | 18 + .../mli/mli_schemas/tensor/tensor_capnp.pyi | 159 +++ tests/test_message_handler/__init__.py | 0 .../test_build_model_key.py | 44 + .../test_build_request_attributes.py | 55 ++ .../test_message_handler/test_build_tensor.py | 185 ++++ .../test_build_tensor_key.py | 44 + .../test_output_descriptor.py | 77 ++ tests/test_message_handler/test_request.py | 906 ++++++++++++++++++ tests/test_message_handler/test_response.py | 341 +++++++ 30 files changed, 3457 insertions(+) create mode 100644 smartsim/_core/mli/__init__.py create mode 100644 smartsim/_core/mli/message_handler.py create mode 100644 smartsim/_core/mli/mli_schemas/data/data_references.capnp create mode 100644 smartsim/_core/mli/mli_schemas/data/data_references_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi create mode 100644 smartsim/_core/mli/mli_schemas/request/request.capnp create mode 100644 smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp create mode 100644 smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi create mode 100644 smartsim/_core/mli/mli_schemas/request/request_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/request/request_capnp.pyi create mode 100644 smartsim/_core/mli/mli_schemas/response/response.capnp create mode 100644 smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp create mode 100644 smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi create mode 100644 smartsim/_core/mli/mli_schemas/response/response_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/response/response_capnp.pyi create mode 100644 smartsim/_core/mli/mli_schemas/tensor/tensor.capnp create mode 100644 smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi create mode 100644 tests/test_message_handler/__init__.py create mode 100644 tests/test_message_handler/test_build_model_key.py create mode 100644 
tests/test_message_handler/test_build_request_attributes.py create mode 100644 tests/test_message_handler/test_build_tensor.py create mode 100644 tests/test_message_handler/test_build_tensor_key.py create mode 100644 tests/test_message_handler/test_output_descriptor.py create mode 100644 tests/test_message_handler/test_request.py create mode 100644 tests/test_message_handler/test_response.py diff --git a/doc/changelog.md b/doc/changelog.md index 1f201f3a8f..78d06663b5 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -9,6 +9,14 @@ Jump to: ## SmartSim +### MLI branch + +Description + +- Added schemas and MessageHandler class for de/serialization of + inference requests and response messages + + ### Development branch To be released at some future point in time diff --git a/setup.py b/setup.py index 96f98bc2cb..55a917e9c6 100644 --- a/setup.py +++ b/setup.py @@ -176,6 +176,7 @@ def has_ext_modules(_placeholder): "protobuf~=3.20", "jinja2>=3.1.2", "watchdog>=4.0.0", + "pycapnp==2.0.0", "pydantic==1.10.14", "pyzmq>=25.1.2", "pygithub>=2.3.0", diff --git a/smartsim/_core/mli/__init__.py b/smartsim/_core/mli/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py new file mode 100644 index 0000000000..733fa83d98 --- /dev/null +++ b/smartsim/_core/mli/message_handler.py @@ -0,0 +1,535 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t + +import numpy as np + +from .mli_schemas.data import data_references_capnp +from .mli_schemas.request import request_capnp +from .mli_schemas.request.request_attributes import request_attributes_capnp +from .mli_schemas.response import response_capnp +from .mli_schemas.response.response_attributes import response_attributes_capnp +from .mli_schemas.tensor import tensor_capnp + + +class MessageHandler: + @staticmethod + def build_tensor( + tensor: np.ndarray[t.Any, np.dtype[t.Any]], + order: "tensor_capnp.Order", + data_type: "tensor_capnp.NumericalType", + dimensions: t.List[int], + ) -> tensor_capnp.Tensor: + """ + Builds a Tensor message using the provided data, + order, data type, and dimensions. 
+ +        :param tensor: Tensor to build the message around +        :param order: Order of the tensor, such as row-major (c) or column-major (f) +        :param data_type: Data type of the tensor +        :param dimensions: Dimensions of the tensor +        :raises ValueError: if building fails +        """ +        try: +            description = tensor_capnp.TensorDescriptor.new_message() +            description.order = order +            description.dataType = data_type +            description.dimensions = dimensions +            built_tensor = tensor_capnp.Tensor.new_message() +            built_tensor.blob = tensor.tobytes()  # tensor channel instead? +            built_tensor.tensorDescriptor = description +        except Exception as e: +            raise ValueError( +                "Error building tensor." +            ) from e  # TODO: create custom exception + +        return built_tensor + +    @staticmethod +    def build_output_tensor_descriptor( +        order: "tensor_capnp.Order", +        keys: t.List["data_references_capnp.TensorKey"], +        data_type: "tensor_capnp.ReturnNumericalType", +        dimensions: t.List[int], +    ) -> tensor_capnp.OutputDescriptor: +        """ +        Builds an OutputDescriptor message using the provided +        order, data type, and dimensions. + +        :param order: Order of the tensor, such as row-major (c) or column-major (f) +        :param keys: List of TensorKeys to apply transform descriptor to +        :param data_type: Transform data type of the tensor +        :param dimensions: Transform dimensions of the tensor +        :raises ValueError: if building fails +        """ +        try: +            description = tensor_capnp.OutputDescriptor.new_message() +            description.order = order +            description.optionalKeys = keys +            description.optionalDatatype = data_type +            description.optionalDimension = dimensions + +        except Exception as e: +            raise ValueError("Error building output tensor descriptor.") from e + +        return description + +    @staticmethod +    def build_tensor_key(key: str) -> data_references_capnp.TensorKey: +        """ +        Builds a new TensorKey message with the provided key. + +        :param key: String to set the TensorKey +        :raises ValueError: if building fails +        """ +        try: +            tensor_key = data_references_capnp.TensorKey.new_message() +            tensor_key.key = key +        except Exception as e: +            raise ValueError("Error building tensor key.") from e +        return tensor_key + +    @staticmethod +    def build_model_key(key: str) -> data_references_capnp.ModelKey: +        """ +        Builds a new ModelKey message with the provided key. + +        :param key: String to set the ModelKey +        :raises ValueError: if building fails +        """ +        try: +            model_key = data_references_capnp.ModelKey.new_message() +            model_key.key = key +        except Exception as e: +            raise ValueError("Error building model key.") from e +        return model_key + +    @staticmethod +    def build_torch_request_attributes( +        tensor_type: "request_attributes_capnp.TorchTensorType", +    ) -> request_attributes_capnp.TorchRequestAttributes: +        """ +        Builds a new TorchRequestAttributes message with the provided tensor type. + +        :param tensor_type: Type of the tensor passed in +        :raises ValueError: if building fails +        """ +        try: +            attributes = request_attributes_capnp.TorchRequestAttributes.new_message() +            attributes.tensorType = tensor_type +        except Exception as e: +            raise ValueError("Error building Torch request attributes.") from e +        return attributes + +    @staticmethod +    def build_tf_request_attributes( +        name: str, tensor_type: "request_attributes_capnp.TFTensorType" +    ) -> request_attributes_capnp.TensorFlowRequestAttributes: +        """ +        Builds a new TensorFlowRequestAttributes message with +        the provided name and tensor type.
+ + :param name: Name of the tensor + :param tensor_type: Type of the tensor passed in + :raises ValueError: if building fails + """ + try: + attributes = ( + request_attributes_capnp.TensorFlowRequestAttributes.new_message() + ) + attributes.name = name + attributes.tensorType = tensor_type + except Exception as e: + raise ValueError("Error building TensorFlow request attributes.") from e + return attributes + + @staticmethod + def build_torch_response_attributes() -> ( + response_attributes_capnp.TorchResponseAttributes + ): + """ + Builds a new TorchResponseAttributes message. + """ + return response_attributes_capnp.TorchResponseAttributes.new_message() + + @staticmethod + def build_tf_response_attributes() -> ( + response_attributes_capnp.TensorFlowResponseAttributes + ): + """ + Builds a new TensorFlowResponseAttributes message. + """ + return response_attributes_capnp.TensorFlowResponseAttributes.new_message() + + @staticmethod + def _assign_model( + request: request_capnp.Request, + model: t.Union[data_references_capnp.ModelKey, t.ByteString], + ) -> None: + """ + Assigns a model to the supplied request. + + :param request: Request being built + :param model: Model to be assigned + :raises ValueError: if building fails + """ + try: + if isinstance(model, bytes): + request.model.modelData = model + else: + request.model.modelKey = model # type: ignore + except Exception as e: + raise ValueError("Error building model portion of request.") from e + + @staticmethod + def _assign_reply_channel( + request: request_capnp.Request, reply_channel: t.ByteString + ) -> None: + """ + Assigns a reply channel to the supplied request. + + :param request: Request being built + :param reply_channel: Reply channel to be assigned + :raises ValueError: if building fails + """ + try: + request.replyChannel.reply = reply_channel + except Exception as e: + raise ValueError("Error building reply channel portion of request.") from e + + @staticmethod + def _assign_device( + request: request_capnp.Request, device: "request_capnp.Device" + ) -> None: + """ + Assigns a device to the supplied request. + + :param request: Request being built + :param device: Device to be assigned + :raises ValueError: if building fails + """ + try: + request.device = device + except Exception as e: + raise ValueError("Error building device portion of request.") from e + + @staticmethod + def _assign_inputs( + request: request_capnp.Request, + inputs: t.Union[ + t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + ], + ) -> None: + """ + Assigns inputs to the supplied request. + + :param request: Request being built + :param inputs: Inputs to be assigned + :raises ValueError: if building fails + """ + try: + if inputs: + display_name = inputs[0].schema.node.displayName # type: ignore + input_class_name = display_name.split(":")[-1] + if input_class_name == "Tensor": + request.input.inputData = inputs # type: ignore + elif input_class_name == "TensorKey": + request.input.inputKeys = inputs # type: ignore + else: + raise ValueError( + "Invalid input class name. Expected 'Tensor' or 'TensorKey'." + ) + except Exception as e: + raise ValueError("Error building inputs portion of request.") from e + + @staticmethod + def _assign_outputs( + request: request_capnp.Request, + outputs: t.List[data_references_capnp.TensorKey], + ) -> None: + """ + Assigns outputs to the supplied request. 
+ + :param request: Request being built + :param outputs: Outputs to be assigned + :raises ValueError: if building fails + """ + try: + request.output = outputs + + except Exception as e: + raise ValueError("Error building outputs portion of request.") from e + + @staticmethod + def _assign_output_descriptors( + request: request_capnp.Request, + output_descriptors: t.List[tensor_capnp.OutputDescriptor], + ) -> None: + """ + Assigns a list of output tensor descriptors to the supplied request. + + :param request: Request being built + :param output_descriptors: Output descriptors to be assigned + :raises ValueError: if building fails + """ + try: + request.outputDescriptors = output_descriptors + except Exception as e: + raise ValueError( + "Error building the output descriptors portion of request." + ) from e + + @staticmethod + def _assign_custom_request_attributes( + request: request_capnp.Request, + custom_attrs: t.Union[ + request_attributes_capnp.TorchRequestAttributes, + request_attributes_capnp.TensorFlowRequestAttributes, + None, + ], + ) -> None: + """ + Assigns request attributes to the supplied request. + + :param request: Request being built + :param custom_attrs: Custom attributes to be assigned + :raises ValueError: if building fails + """ + try: + if custom_attrs is None: + request.customAttributes.none = custom_attrs + else: + custom_attribute_class_name = ( + custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore + ) + if custom_attribute_class_name == "TorchRequestAttributes": + request.customAttributes.torch = custom_attrs # type: ignore + elif custom_attribute_class_name == "TensorFlowRequestAttributes": + request.customAttributes.tf = custom_attrs # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorFlowRequestAttributes' or + 'TorchRequestAttributes'.""") + except Exception as e: + raise ValueError( + "Error building custom attributes portion of request." + ) from e + + @staticmethod + def build_request( + reply_channel: t.ByteString, + model: t.Union[data_references_capnp.ModelKey, t.ByteString], + device: "request_capnp.Device", + inputs: t.Union[ + t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + ], + outputs: t.List[data_references_capnp.TensorKey], + output_descriptors: t.List[tensor_capnp.OutputDescriptor], + custom_attributes: t.Union[ + request_attributes_capnp.TorchRequestAttributes, + request_attributes_capnp.TensorFlowRequestAttributes, + None, + ], + ) -> request_capnp.Request: + """ + Builds the request message. 
+ + :param reply_channel: Reply channel to be assigned to request + :param model: Model to be assigned to request + :param device: Device to be assigned to request + :param inputs: Inputs to be assigned to request + :param outputs: Outputs to be assigned to request + :param output_descriptors: Output descriptors to be assigned to request + :param custom_attributes: Custom attributes to be assigned to request + """ + request = request_capnp.Request.new_message() + MessageHandler._assign_reply_channel(request, reply_channel) + MessageHandler._assign_model(request, model) + MessageHandler._assign_device(request, device) + MessageHandler._assign_inputs(request, inputs) + MessageHandler._assign_outputs(request, outputs) + MessageHandler._assign_output_descriptors(request, output_descriptors) + MessageHandler._assign_custom_request_attributes(request, custom_attributes) + return request + + @staticmethod + def serialize_request(request: request_capnp.RequestBuilder) -> t.ByteString: + """ + Serializes a built request message. + + :param request: Request to be serialized + """ + return request.to_bytes() + + @staticmethod + def deserialize_request(request_bytes: t.ByteString) -> request_capnp.Request: + """ + Deserializes a serialized request message. + + :param request_bytes: Bytes to be deserialized into a Request + """ + bytes_message = request_capnp.Request.from_bytes(request_bytes) + + with bytes_message as message: + return message + + @staticmethod + def _assign_status( + response: response_capnp.Response, status: "response_capnp.StatusEnum" + ) -> None: + """ + Assigns a status to the supplied response. + + :param response: Response being built + :param status: Status to be assigned + :raises ValueError: if building fails + """ + try: + response.status = status + except Exception as e: + raise ValueError("Error assigning status to response.") from e + + @staticmethod + def _assign_message(response: response_capnp.Response, message: str) -> None: + """ + Assigns a message to the supplied response. + + :param response: Response being built + :param message: Message to be assigned + :raises ValueError: if building fails + """ + try: + response.message = message + except Exception as e: + raise ValueError("Error assigning message to response.") from e + + @staticmethod + def _assign_result( + response: response_capnp.Response, + result: t.Union[ + t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + ], + ) -> None: + """ + Assigns a result to the supplied response. + + :param response: Response being built + :param result: Result to be assigned + :raises ValueError: if building fails + """ + try: + if result: + first_result = result[0] + display_name = first_result.schema.node.displayName # type: ignore + result_class_name = display_name.split(":")[-1] + if result_class_name == "Tensor": + response.result.data = result # type: ignore + elif result_class_name == "TensorKey": + response.result.keys = result # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'Tensor' or 'TensorKey'.""") + except Exception as e: + raise ValueError("Error assigning result to response.") from e + + @staticmethod + def _assign_custom_response_attributes( + response: response_capnp.Response, + custom_attrs: t.Union[ + response_attributes_capnp.TorchResponseAttributes, + response_attributes_capnp.TensorFlowResponseAttributes, + None, + ], + ) -> None: + """ + Assigns custom attributes to the supplied response. 
+ + :param response: Response being built + :param custom_attrs: Custom attributes to be assigned + :raises ValueError: if building fails + """ + try: + if custom_attrs is None: + response.customAttributes.none = custom_attrs + else: + custom_attribute_class_name = ( + custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore + ) + if custom_attribute_class_name == "TorchResponseAttributes": + response.customAttributes.torch = custom_attrs # type: ignore + elif custom_attribute_class_name == "TensorFlowResponseAttributes": + response.customAttributes.tf = custom_attrs # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorFlowResponseAttributes' or + 'TorchResponseAttributes'.""") + except Exception as e: + raise ValueError("Error assigning custom attributes to response.") from e + + @staticmethod + def build_response( + status: "response_capnp.StatusEnum", + message: str, + result: t.Union[ + t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + ], + custom_attributes: t.Union[ + response_attributes_capnp.TorchResponseAttributes, + response_attributes_capnp.TensorFlowResponseAttributes, + None, + ], + ) -> response_capnp.Response: + """ + Builds the response message. + + :param status: Status to be assigned to response + :param message: Message to be assigned to response + :param result: Result to be assigned to response + :param custom_attributes: Custom attributes to be assigned to response + """ + response = response_capnp.Response.new_message() + MessageHandler._assign_status(response, status) + MessageHandler._assign_message(response, message) + MessageHandler._assign_result(response, result) + MessageHandler._assign_custom_response_attributes(response, custom_attributes) + return response + + @staticmethod + def serialize_response(response: response_capnp.ResponseBuilder) -> t.ByteString: + """ + Serializes a built response message. + """ + return response.to_bytes() + + @staticmethod + def deserialize_response(response_bytes: t.ByteString) -> response_capnp.Response: + """ + Deserializes a serialized response message. + """ + bytes_message = response_capnp.Response.from_bytes(response_bytes) + + with bytes_message as message: + return message diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp new file mode 100644 index 0000000000..fa35989b32 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -0,0 +1,35 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0x8ca69fd1aacb6668; + +struct ModelKey { + key @0 :Text; +} + +struct TensorKey { + key @0 :Text; +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py new file mode 100644 index 0000000000..de3f080116 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py @@ -0,0 +1,15 @@ +"""This is an automatically generated stub for `data_references.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "data_references.capnp")) +ModelKey = capnp.load(module_file).ModelKey +ModelKeyBuilder = ModelKey +ModelKeyReader = ModelKey +TensorKey = capnp.load(module_file).TensorKey +TensorKeyBuilder = TensorKey +TensorKeyReader = TensorKey diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi new file mode 100644 index 0000000000..0e0edb8f99 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -0,0 +1,79 @@ +"""This is an automatically generated stub for `data_references.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class ModelKey: + key: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ModelKeyReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ModelKeyReader: ... + @staticmethod + def new_message() -> ModelKeyBuilder: ... + def to_dict(self) -> dict: ... + +class ModelKeyReader(ModelKey): + def as_builder(self) -> ModelKeyBuilder: ... + +class ModelKeyBuilder(ModelKey): + @staticmethod + def from_dict(dictionary: dict) -> ModelKeyBuilder: ... + def copy(self) -> ModelKeyBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ModelKeyReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class TensorKey: + key: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorKeyReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorKeyReader: ... + @staticmethod + def new_message() -> TensorKeyBuilder: ... + def to_dict(self) -> dict: ... + +class TensorKeyReader(TensorKey): + def as_builder(self) -> TensorKeyBuilder: ... 
+ +class TensorKeyBuilder(TensorKey): + @staticmethod + def from_dict(dictionary: dict) -> TensorKeyBuilder: ... + def copy(self) -> TensorKeyBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorKeyReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp new file mode 100644 index 0000000000..446c628a4c --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -0,0 +1,61 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xa27f0152c7bb299e; + +using Tensors = import "../tensor/tensor.capnp"; +using RequestAttributes = import "request_attributes/request_attributes.capnp"; +using DataRef = import "../data/data_references.capnp"; + +enum Device { + cpu @0; + gpu @1; + auto @2; +} + +struct ChannelDescriptor { + reply @0 :Data; +} + +struct Request { + replyChannel @0 :ChannelDescriptor; + model :union { + modelKey @1 :DataRef.ModelKey; + modelData @2 :Data; + } + device @3 :Device; + input :union { + inputKeys @4 :List(DataRef.TensorKey); + inputData @5 :List(Tensors.Tensor); + } + output @6 :List(DataRef.TensorKey); + outputDescriptors @7 :List(Tensors.OutputDescriptor); + customAttributes :union { + torch @8 :RequestAttributes.TorchRequestAttributes; + tf @9 :RequestAttributes.TensorFlowRequestAttributes; + none @10 :Void; + } +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp new file mode 100644 index 0000000000..bc1af14d12 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp @@ -0,0 +1,49 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+ +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xdd14d8ba5c06743f; + +enum TorchTensorType { + nested @0; # ragged + sparse @1; + tensor @2; # "normal" tensor +} + +enum TFTensorType { + ragged @0; + sparse @1; + variable @2; + constant @3; +} + +struct TorchRequestAttributes { + tensorType @0 :TorchTensorType; +} + +struct TensorFlowRequestAttributes { + name @0 :Text; + tensorType @1 :TFTensorType; +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py new file mode 100644 index 0000000000..446ee6541f --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py @@ -0,0 +1,15 @@ +"""This is an automatically generated stub for `request_attributes.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "request_attributes.capnp")) +TorchRequestAttributes = capnp.load(module_file).TorchRequestAttributes +TorchRequestAttributesBuilder = TorchRequestAttributes +TorchRequestAttributesReader = TorchRequestAttributes +TensorFlowRequestAttributes = capnp.load(module_file).TensorFlowRequestAttributes +TensorFlowRequestAttributesBuilder = TensorFlowRequestAttributes +TensorFlowRequestAttributesReader = TensorFlowRequestAttributes diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi new file mode 100644 index 0000000000..977c3e6a09 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi @@ -0,0 +1,83 @@ +"""This is an automatically generated stub for `request_attributes.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal + +TorchTensorType = Literal["nested", "sparse", "tensor"] +TFTensorType = Literal["ragged", "sparse", "variable", "constant"] + +class TorchRequestAttributes: + tensorType: TorchTensorType + @staticmethod + 
@contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TorchRequestAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TorchRequestAttributesReader: ... + @staticmethod + def new_message() -> TorchRequestAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TorchRequestAttributesReader(TorchRequestAttributes): + def as_builder(self) -> TorchRequestAttributesBuilder: ... + +class TorchRequestAttributesBuilder(TorchRequestAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TorchRequestAttributesBuilder: ... + def copy(self) -> TorchRequestAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TorchRequestAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class TensorFlowRequestAttributes: + name: str + tensorType: TFTensorType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorFlowRequestAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorFlowRequestAttributesReader: ... + @staticmethod + def new_message() -> TensorFlowRequestAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TensorFlowRequestAttributesReader(TensorFlowRequestAttributes): + def as_builder(self) -> TensorFlowRequestAttributesBuilder: ... + +class TensorFlowRequestAttributesBuilder(TensorFlowRequestAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TensorFlowRequestAttributesBuilder: ... + def copy(self) -> TensorFlowRequestAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorFlowRequestAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
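The two request-attribute structs above are exposed through small MessageHandler helpers. A minimal usage sketch, assuming the module is importable as smartsim._core.mli.message_handler and using the enumerant names from request_attributes.capnp (the tensor name "input_1" is an illustrative placeholder):

from smartsim._core.mli.message_handler import MessageHandler

# Torch attributes carry only the tensor type; TorchTensorType allows
# "nested", "sparse", or "tensor"
torch_attrs = MessageHandler.build_torch_request_attributes("tensor")

# TensorFlow attributes also carry the tensor name; TFTensorType allows
# "ragged", "sparse", "variable", or "constant"
tf_attrs = MessageHandler.build_tf_request_attributes(
    name="input_1", tensor_type="variable"
)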
diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_capnp.py new file mode 100644 index 0000000000..d8370b662d --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.py @@ -0,0 +1,15 @@ +"""This is an automatically generated stub for `request.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "request.capnp")) +ChannelDescriptor = capnp.load(module_file).ChannelDescriptor +ChannelDescriptorBuilder = ChannelDescriptor +ChannelDescriptorReader = ChannelDescriptor +Request = capnp.load(module_file).Request +RequestBuilder = Request +RequestReader = Request diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi new file mode 100644 index 0000000000..5d622d4e6d --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -0,0 +1,286 @@ +"""This is an automatically generated stub for `request.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence, overload + +from ..data.data_references_capnp import ( + ModelKey, + ModelKeyBuilder, + ModelKeyReader, + TensorKey, + TensorKeyBuilder, + TensorKeyReader, +) +from ..tensor.tensor_capnp import ( + OutputDescriptor, + OutputDescriptorBuilder, + OutputDescriptorReader, + Tensor, + TensorBuilder, + TensorReader, +) +from .request_attributes.request_attributes_capnp import ( + TensorFlowRequestAttributes, + TensorFlowRequestAttributesBuilder, + TensorFlowRequestAttributesReader, + TorchRequestAttributes, + TorchRequestAttributesBuilder, + TorchRequestAttributesReader, +) + +Device = Literal["cpu", "gpu", "auto"] + +class ChannelDescriptor: + reply: bytes + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ChannelDescriptorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ChannelDescriptorReader: ... + @staticmethod + def new_message() -> ChannelDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class ChannelDescriptorReader(ChannelDescriptor): + def as_builder(self) -> ChannelDescriptorBuilder: ... + +class ChannelDescriptorBuilder(ChannelDescriptor): + @staticmethod + def from_dict(dictionary: dict) -> ChannelDescriptorBuilder: ... + def copy(self) -> ChannelDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ChannelDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class Request: + class Model: + modelKey: ModelKey | ModelKeyBuilder | ModelKeyReader + modelData: bytes + def which(self) -> Literal["modelKey", "modelData"]: ... + def init(self, name: Literal["modelKey"]) -> ModelKey: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.ModelReader]: ... 
+ @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.ModelReader: ... + @staticmethod + def new_message() -> Request.ModelBuilder: ... + def to_dict(self) -> dict: ... + + class ModelReader(Request.Model): + modelKey: ModelKeyReader + def as_builder(self) -> Request.ModelBuilder: ... + + class ModelBuilder(Request.Model): + modelKey: ModelKey | ModelKeyBuilder | ModelKeyReader + @staticmethod + def from_dict(dictionary: dict) -> Request.ModelBuilder: ... + def copy(self) -> Request.ModelBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.ModelReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class Input: + inputKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + inputData: Sequence[Tensor | TensorBuilder | TensorReader] + def which(self) -> Literal["inputKeys", "inputData"]: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.InputReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.InputReader: ... + @staticmethod + def new_message() -> Request.InputBuilder: ... + def to_dict(self) -> dict: ... + + class InputReader(Request.Input): + inputKeys: Sequence[TensorKeyReader] + inputData: Sequence[TensorReader] + def as_builder(self) -> Request.InputBuilder: ... + + class InputBuilder(Request.Input): + inputKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + inputData: Sequence[Tensor | TensorBuilder | TensorReader] + @staticmethod + def from_dict(dictionary: dict) -> Request.InputBuilder: ... + def copy(self) -> Request.InputBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.InputReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class CustomAttributes: + torch: ( + TorchRequestAttributes + | TorchRequestAttributesBuilder + | TorchRequestAttributesReader + ) + tf: ( + TensorFlowRequestAttributes + | TensorFlowRequestAttributesBuilder + | TensorFlowRequestAttributesReader + ) + none: None + def which(self) -> Literal["torch", "tf", "none"]: ... + @overload + def init(self, name: Literal["torch"]) -> TorchRequestAttributes: ... + @overload + def init(self, name: Literal["tf"]) -> TensorFlowRequestAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.CustomAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.CustomAttributesReader: ... + @staticmethod + def new_message() -> Request.CustomAttributesBuilder: ... + def to_dict(self) -> dict: ... + + class CustomAttributesReader(Request.CustomAttributes): + torch: TorchRequestAttributesReader + tf: TensorFlowRequestAttributesReader + def as_builder(self) -> Request.CustomAttributesBuilder: ... 
+ + class CustomAttributesBuilder(Request.CustomAttributes): + torch: ( + TorchRequestAttributes + | TorchRequestAttributesBuilder + | TorchRequestAttributesReader + ) + tf: ( + TensorFlowRequestAttributes + | TensorFlowRequestAttributesBuilder + | TensorFlowRequestAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> Request.CustomAttributesBuilder: ... + def copy(self) -> Request.CustomAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.CustomAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader + model: Request.Model | Request.ModelBuilder | Request.ModelReader + device: Device + input: Request.Input | Request.InputBuilder | Request.InputReader + output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + outputDescriptors: Sequence[ + OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader + ] + customAttributes: ( + Request.CustomAttributes + | Request.CustomAttributesBuilder + | Request.CustomAttributesReader + ) + @overload + def init(self, name: Literal["replyChannel"]) -> ChannelDescriptor: ... + @overload + def init(self, name: Literal["model"]) -> Model: ... + @overload + def init(self, name: Literal["input"]) -> Input: ... + @overload + def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[RequestReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> RequestReader: ... + @staticmethod + def new_message() -> RequestBuilder: ... + def to_dict(self) -> dict: ... + +class RequestReader(Request): + replyChannel: ChannelDescriptorReader + model: Request.ModelReader + input: Request.InputReader + output: Sequence[TensorKeyReader] + outputDescriptors: Sequence[OutputDescriptorReader] + customAttributes: Request.CustomAttributesReader + def as_builder(self) -> RequestBuilder: ... + +class RequestBuilder(Request): + replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader + model: Request.Model | Request.ModelBuilder | Request.ModelReader + input: Request.Input | Request.InputBuilder | Request.InputReader + output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + outputDescriptors: Sequence[ + OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader + ] + customAttributes: ( + Request.CustomAttributes + | Request.CustomAttributesBuilder + | Request.CustomAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> RequestBuilder: ... + def copy(self) -> RequestBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> RequestReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
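With the request schema and stubs in place, MessageHandler.build_request ties the pieces together. A hedged end-to-end sketch (the key names and reply-channel bytes are illustrative placeholders; inputs are given as TensorKeys here, though a list of built Tensor messages is also accepted):

from smartsim._core.mli.message_handler import MessageHandler

model_key = MessageHandler.build_model_key("model-key")    # or pass raw model bytes
input_key = MessageHandler.build_tensor_key("input-key")
output_key = MessageHandler.build_tensor_key("output-key")
torch_attrs = MessageHandler.build_torch_request_attributes("tensor")

request = MessageHandler.build_request(
    reply_channel=b"reply-channel-descriptor",
    model=model_key,
    device="cpu",                   # Device enum: "cpu", "gpu", or "auto"
    inputs=[input_key],
    outputs=[output_key],
    output_descriptors=[],
    custom_attributes=torch_attrs,  # or None
)

# Round-trip through bytes, e.g. before and after crossing a channel
request_bytes = MessageHandler.serialize_request(request)
deserialized = MessageHandler.deserialize_request(request_bytes)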
diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp new file mode 100644 index 0000000000..0c5cee1a1c --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -0,0 +1,51 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xa05dcb4444780705; + +using Tensors = import "../tensor/tensor.capnp"; +using ResponseAttributes = import "response_attributes/response_attributes.capnp"; +using DataRef = import "../data/data_references.capnp"; + +enum StatusEnum { + complete @0; + fail @1; + timeout @2; +} + +struct Response { + status @0 :StatusEnum; + message @1 :Text; + result :union { + keys @2 :List(DataRef.TensorKey); + data @3 :List(Tensors.Tensor); + } + customAttributes :union { + torch @4 :ResponseAttributes.TorchResponseAttributes; + tf @5 :ResponseAttributes.TensorFlowResponseAttributes; + none @6 :Void; + } +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp new file mode 100644 index 0000000000..59acd60312 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp @@ -0,0 +1,33 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xee59c60fccbb1bf9; + +struct TorchResponseAttributes { +} + +struct TensorFlowResponseAttributes { +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py new file mode 100644 index 0000000000..3df1115b47 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py @@ -0,0 +1,15 @@ +"""This is an automatically generated stub for `response_attributes.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "response_attributes.capnp")) +TorchResponseAttributes = capnp.load(module_file).TorchResponseAttributes +TorchResponseAttributesBuilder = TorchResponseAttributes +TorchResponseAttributesReader = TorchResponseAttributes +TensorFlowResponseAttributes = capnp.load(module_file).TensorFlowResponseAttributes +TensorFlowResponseAttributesBuilder = TensorFlowResponseAttributes +TensorFlowResponseAttributesReader = TensorFlowResponseAttributes diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi new file mode 100644 index 0000000000..63c2218ff4 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi @@ -0,0 +1,77 @@ +"""This is an automatically generated stub for `response_attributes.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class TorchResponseAttributes: + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TorchResponseAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TorchResponseAttributesReader: ... + @staticmethod + def new_message() -> TorchResponseAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TorchResponseAttributesReader(TorchResponseAttributes): + def as_builder(self) -> TorchResponseAttributesBuilder: ... + +class TorchResponseAttributesBuilder(TorchResponseAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TorchResponseAttributesBuilder: ... + def copy(self) -> TorchResponseAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TorchResponseAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
+ +class TensorFlowResponseAttributes: + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorFlowResponseAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorFlowResponseAttributesReader: ... + @staticmethod + def new_message() -> TensorFlowResponseAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TensorFlowResponseAttributesReader(TensorFlowResponseAttributes): + def as_builder(self) -> TensorFlowResponseAttributesBuilder: ... + +class TensorFlowResponseAttributesBuilder(TensorFlowResponseAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TensorFlowResponseAttributesBuilder: ... + def copy(self) -> TensorFlowResponseAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorFlowResponseAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_capnp.py new file mode 100644 index 0000000000..5762408272 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.py @@ -0,0 +1,12 @@ +"""This is an automatically generated stub for `response.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "response.capnp")) +Response = capnp.load(module_file).Response +ResponseBuilder = Response +ResponseReader = Response diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi new file mode 100644 index 0000000000..194c50d1c5 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -0,0 +1,178 @@ +"""This is an automatically generated stub for `response.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence, overload + +from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader +from ..tensor.tensor_capnp import Tensor, TensorBuilder, TensorReader +from .response_attributes.response_attributes_capnp import ( + TensorFlowResponseAttributes, + TensorFlowResponseAttributesBuilder, + TensorFlowResponseAttributesReader, + TorchResponseAttributes, + TorchResponseAttributesBuilder, + TorchResponseAttributesReader, +) + +StatusEnum = Literal["complete", "fail", "timeout"] + +class Response: + class Result: + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + data: Sequence[Tensor | TensorBuilder | TensorReader] + def which(self) -> Literal["keys", "data"]: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Response.ResultReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Response.ResultReader: ... + @staticmethod + def new_message() -> Response.ResultBuilder: ... 
+ def to_dict(self) -> dict: ... + + class ResultReader(Response.Result): + keys: Sequence[TensorKeyReader] + data: Sequence[TensorReader] + def as_builder(self) -> Response.ResultBuilder: ... + + class ResultBuilder(Response.Result): + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + data: Sequence[Tensor | TensorBuilder | TensorReader] + @staticmethod + def from_dict(dictionary: dict) -> Response.ResultBuilder: ... + def copy(self) -> Response.ResultBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Response.ResultReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class CustomAttributes: + torch: ( + TorchResponseAttributes + | TorchResponseAttributesBuilder + | TorchResponseAttributesReader + ) + tf: ( + TensorFlowResponseAttributes + | TensorFlowResponseAttributesBuilder + | TensorFlowResponseAttributesReader + ) + none: None + def which(self) -> Literal["torch", "tf", "none"]: ... + @overload + def init(self, name: Literal["torch"]) -> TorchResponseAttributes: ... + @overload + def init(self, name: Literal["tf"]) -> TensorFlowResponseAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Response.CustomAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Response.CustomAttributesReader: ... + @staticmethod + def new_message() -> Response.CustomAttributesBuilder: ... + def to_dict(self) -> dict: ... + + class CustomAttributesReader(Response.CustomAttributes): + torch: TorchResponseAttributesReader + tf: TensorFlowResponseAttributesReader + def as_builder(self) -> Response.CustomAttributesBuilder: ... + + class CustomAttributesBuilder(Response.CustomAttributes): + torch: ( + TorchResponseAttributes + | TorchResponseAttributesBuilder + | TorchResponseAttributesReader + ) + tf: ( + TensorFlowResponseAttributes + | TensorFlowResponseAttributesBuilder + | TensorFlowResponseAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> Response.CustomAttributesBuilder: ... + def copy(self) -> Response.CustomAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Response.CustomAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + status: StatusEnum + message: str + result: Response.Result | Response.ResultBuilder | Response.ResultReader + customAttributes: ( + Response.CustomAttributes + | Response.CustomAttributesBuilder + | Response.CustomAttributesReader + ) + @overload + def init(self, name: Literal["result"]) -> Result: ... + @overload + def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ResponseReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ResponseReader: ... + @staticmethod + def new_message() -> ResponseBuilder: ... 
+ def to_dict(self) -> dict: ... + +class ResponseReader(Response): + result: Response.ResultReader + customAttributes: Response.CustomAttributesReader + def as_builder(self) -> ResponseBuilder: ... + +class ResponseBuilder(Response): + result: Response.Result | Response.ResultBuilder | Response.ResultReader + customAttributes: ( + Response.CustomAttributes + | Response.CustomAttributesBuilder + | Response.CustomAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> ResponseBuilder: ... + def copy(self) -> ResponseBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ResponseReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp new file mode 100644 index 0000000000..0097a0f9bb --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -0,0 +1,80 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +@0x9a0aeb2e04838fb1; + +using DataRef = import "../data/data_references.capnp"; + +enum Order { + c @0; # row major (contiguous layout) + f @1; # column major (fortran contiguous layout) +} + +enum NumericalType { + int8 @0; + int16 @1; + int32 @2; + int64 @3; + uInt8 @4; + uInt16 @5; + uInt32 @6; + uInt64 @7; + float32 @8; + float64 @9; +} + +enum ReturnNumericalType { + int8 @0; + int16 @1; + int32 @2; + int64 @3; + uInt8 @4; + uInt16 @5; + uInt32 @6; + uInt64 @7; + float32 @8; + float64 @9; + none @10; + auto @ 11; +} + +struct Tensor { + blob @0 :Data; + tensorDescriptor @1 :TensorDescriptor; +} + +struct TensorDescriptor { + dimensions @0 :List(Int32); + order @1 :Order; + dataType @2 :NumericalType; +} + +struct OutputDescriptor { + order @0 :Order; + optionalKeys @1 :List(DataRef.TensorKey); + optionalDimension @2 :List(Int32); + optionalDatatype @3 :ReturnNumericalType; +} \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py new file mode 100644 index 0000000000..a3938bda53 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py @@ -0,0 +1,18 @@ +"""This is an automatically generated stub for `tensor.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "tensor.capnp")) +Tensor = capnp.load(module_file).Tensor +TensorBuilder = Tensor +TensorReader = Tensor +TensorDescriptor = capnp.load(module_file).TensorDescriptor +TensorDescriptorBuilder = TensorDescriptor +TensorDescriptorReader = TensorDescriptor +OutputDescriptor = capnp.load(module_file).OutputDescriptor +OutputDescriptorBuilder = OutputDescriptor +OutputDescriptorReader = OutputDescriptor diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi new file mode 100644 index 0000000000..462911afdf --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi @@ -0,0 +1,159 @@ +"""This is an automatically generated stub for `tensor.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence + +from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader + +Order = Literal["c", "f"] +NumericalType = Literal[ + "int8", + "int16", + "int32", + "int64", + "uInt8", + "uInt16", + "uInt32", + "uInt64", + "float32", + "float64", +] +ReturnNumericalType = Literal[ + "int8", + "int16", + "int32", + "int64", + "uInt8", + "uInt16", + "uInt32", + "uInt64", + "float32", + "float64", + "none", + "auto", +] + +class TensorDescriptor: + dimensions: Sequence[int] + order: Order + dataType: NumericalType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorDescriptorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorDescriptorReader: ... + @staticmethod + def new_message() -> TensorDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class TensorDescriptorReader(TensorDescriptor): + def as_builder(self) -> TensorDescriptorBuilder: ... 
+ +class TensorDescriptorBuilder(TensorDescriptor): + @staticmethod + def from_dict(dictionary: dict) -> TensorDescriptorBuilder: ... + def copy(self) -> TensorDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class Tensor: + blob: bytes + tensorDescriptor: ( + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ) + def init(self, name: Literal["tensorDescriptor"]) -> TensorDescriptor: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorReader: ... + @staticmethod + def new_message() -> TensorBuilder: ... + def to_dict(self) -> dict: ... + +class TensorReader(Tensor): + tensorDescriptor: TensorDescriptorReader + def as_builder(self) -> TensorBuilder: ... + +class TensorBuilder(Tensor): + tensorDescriptor: ( + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ) + @staticmethod + def from_dict(dictionary: dict) -> TensorBuilder: ... + def copy(self) -> TensorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class OutputDescriptor: + order: Order + optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + optionalDimension: Sequence[int] + optionalDatatype: ReturnNumericalType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[OutputDescriptorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> OutputDescriptorReader: ... + @staticmethod + def new_message() -> OutputDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class OutputDescriptorReader(OutputDescriptor): + optionalKeys: Sequence[TensorKeyReader] + def as_builder(self) -> OutputDescriptorBuilder: ... + +class OutputDescriptorBuilder(OutputDescriptor): + optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + @staticmethod + def from_dict(dictionary: dict) -> OutputDescriptorBuilder: ... + def copy(self) -> OutputDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> OutputDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
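[editor's note, not part of the patch] The generated tensor stubs above can also be driven directly with pycapnp, independent of the MessageHandler wrapper that the tests below exercise through build_tensor. The following is a minimal, illustrative sketch of how a Tensor message defined in tensor.capnp might be built, serialized, and read back; the variable names and the example payload are assumptions for illustration only, not code from this PR.

    import numpy as np
    from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import Tensor

    # Hypothetical payload: a small int8 array flattened into the blob field.
    array = np.zeros((3, 2, 5), dtype=np.int8)
    tensor_msg = Tensor.new_message()
    tensor_msg.blob = array.tobytes()

    # Populate the nested TensorDescriptor struct declared in tensor.capnp.
    descriptor = tensor_msg.init("tensorDescriptor")
    descriptor.dimensions = list(array.shape)
    descriptor.order = "c"        # row major, per the Order enum
    descriptor.dataType = "int8"  # per the NumericalType enum

    # Serialize to bytes and read the message back with the reader interface.
    payload = tensor_msg.to_bytes()
    with Tensor.from_bytes(payload) as reader:
        assert reader.tensorDescriptor.dataType == "int8"
        assert list(reader.tensorDescriptor.dimensions) == [3, 2, 5]

The tests added later in this patch construct the same structures indirectly via MessageHandler.build_tensor, which validates the order, data type, and dimension arguments before building the message.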
diff --git a/tests/test_message_handler/__init__.py b/tests/test_message_handler/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py new file mode 100644 index 0000000000..135e967983 --- /dev/null +++ b/tests/test_message_handler/test_build_model_key.py @@ -0,0 +1,44 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_model_key_successful(): + model_key = handler.build_model_key("tensor_key") + assert model_key.key == "tensor_key" + + +def test_build_model_key_unsuccessful(): + with pytest.raises(ValueError): + model_key = handler.build_model_key(100) diff --git a/tests/test_message_handler/test_build_request_attributes.py b/tests/test_message_handler/test_build_request_attributes.py new file mode 100644 index 0000000000..5b1e09b0aa --- /dev/null +++ b/tests/test_message_handler/test_build_request_attributes.py @@ -0,0 +1,55 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_torch_request_attributes_successful(): + attribute = handler.build_torch_request_attributes("sparse") + assert attribute.tensorType == "sparse" + + +def test_build_torch_request_attributes_unsuccessful(): + with pytest.raises(ValueError): + attribute = handler.build_torch_request_attributes("invalid!") + + +def test_build_tf_request_attributes_successful(): + attribute = handler.build_tf_request_attributes(name="tfcnn", tensor_type="sparse") + assert attribute.tensorType == "sparse" + assert attribute.name == "tfcnn" + + +def test_build_tf_request_attributes_unsuccessful(): + with pytest.raises(ValueError): + attribute = handler.build_tf_request_attributes("tf_fail", "invalid!") diff --git a/tests/test_message_handler/test_build_tensor.py b/tests/test_message_handler/test_build_tensor.py new file mode 100644 index 0000000000..aa7bd4e6e2 --- /dev/null +++ b/tests/test_message_handler/test_build_tensor.py @@ -0,0 +1,185 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +try: + import tensorflow as tf +except ImportError: + should_run_tf = False +else: + should_run_tf = True + + small_tf_tensor = tf.zeros((3, 2, 5), dtype=tf.int8) + small_tf_tensor = small_tf_tensor.numpy() + medium_tf_tensor = tf.ones((1040, 1040, 3), dtype=tf.int64) + medium_tf_tensor = medium_tf_tensor.numpy() + + +try: + import torch +except ImportError: + should_run_torch = False +else: + should_run_torch = True + + small_torch_tensor = torch.zeros((3, 2, 5), dtype=torch.int8) + small_torch_tensor = small_torch_tensor.numpy() + medium_torch_tensor = torch.ones((1040, 1040, 3), dtype=torch.int64) + medium_torch_tensor = medium_torch_tensor.numpy() + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "tensor, dtype, order, dimension", + [ + pytest.param( + small_torch_tensor, + "int8", + "c", + [3, 2, 5], + id="small torch tensor", + ), + pytest.param( + medium_torch_tensor, + "int64", + "c", + [1040, 1040, 3], + id="medium torch tensor", + ), + ], +) +def test_build_torch_tensor_successful(tensor, dtype, order, dimension): + built_tensor = handler.build_tensor(tensor, order, dtype, dimension) + assert built_tensor is not None + assert type(built_tensor.blob) == bytes + assert built_tensor.tensorDescriptor.order == order + assert built_tensor.tensorDescriptor.dataType == dtype + for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): + assert i == j + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "tensor, dtype, order, dimension", + [ + pytest.param( + small_tf_tensor, + "int8", + "c", + [3, 2, 5], + id="small tf tensor", + ), + pytest.param( + medium_tf_tensor, + "int64", + "c", + [1040, 1040, 3], + id="medium tf tensor", + ), + ], +) +def test_build_tf_tensor_successful(tensor, dtype, order, dimension): + built_tensor = handler.build_tensor(tensor, order, dtype, dimension) + assert built_tensor is not None + assert type(built_tensor.blob) == bytes + assert built_tensor.tensorDescriptor.order == order + assert built_tensor.tensorDescriptor.dataType == dtype + for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): + assert i == j + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "tensor, dtype, order, dimension", + [ + pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), + pytest.param( + small_torch_tensor, + "bad_order", + "int8", + [3, 2, 5], + id="bad order type", + ), + pytest.param( + small_torch_tensor, + "f", + "bad_num_type", + [3, 2, 5], + id="bad numerical type", + ), + pytest.param( + small_torch_tensor, + "f", + "int8", + "bad shape type", + id="bad shape type", + ), + ], +) +def test_build_torch_tensor_bad_input(tensor, dtype, order, dimension): + with pytest.raises(ValueError): + built_tensor = handler.build_tensor(tensor, order, dtype, dimension) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "tensor, dtype, order, dimension", + [ + pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), + pytest.param( + small_tf_tensor, + "bad_order", + "int8", + [3, 2, 5], + id="bad order type", + ), + pytest.param( + small_tf_tensor, + "f", + "bad_num_type", + [3, 2, 5], + id="bad numerical 
type", + ), + pytest.param( + small_tf_tensor, + "f", + "int8", + "bad shape type", + id="bad shape type", + ), + ], +) +def test_build_tf_tensor_bad_input(tensor, dtype, order, dimension): + with pytest.raises(ValueError): + built_tensor = handler.build_tensor(tensor, order, dtype, dimension) diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py new file mode 100644 index 0000000000..7abe9e853d --- /dev/null +++ b/tests/test_message_handler/test_build_tensor_key.py @@ -0,0 +1,44 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_tensor_key_successful(): + tensor_key = handler.build_tensor_key("tensor_key") + assert tensor_key.key == "tensor_key" + + +def test_build_tensor_key_unsuccessful(): + with pytest.raises(ValueError): + tensor_key = handler.build_tensor_key(100) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py new file mode 100644 index 0000000000..fd21eeb0d5 --- /dev/null +++ b/tests/test_message_handler/test_output_descriptor.py @@ -0,0 +1,77 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + +tensor_key = handler.build_tensor_key("key") + + +@pytest.mark.parametrize( + "order, keys, dtype, dimension", + [ + pytest.param("c", [tensor_key], "int8", [1, 2, 3, 4], id="all specified"), + pytest.param( + "c", [tensor_key, tensor_key], "none", [1, 2, 3, 4], id="none dtype" + ), + pytest.param("c", [tensor_key], "int8", [], id="empty dimensions"), + pytest.param("c", [], "int8", [1, 2, 3, 4], id="empty keys"), + ], +) +def test_build_output_tensor_descriptor_successful(dtype, keys, order, dimension): + built_descriptor = handler.build_output_tensor_descriptor( + order, keys, dtype, dimension + ) + assert built_descriptor is not None + assert built_descriptor.order == order + assert len(built_descriptor.optionalKeys) == len(keys) + assert built_descriptor.optionalDatatype == dtype + for i, j in zip(built_descriptor.optionalDimension, dimension): + assert i == j + + +@pytest.mark.parametrize( + "order, keys, dtype, dimension", + [ + pytest.param("bad_order", [], "int8", [3, 2, 5], id="bad order type"), + pytest.param( + "f", [tensor_key], "bad_num_type", [3, 2, 5], id="bad numerical type" + ), + pytest.param("f", [tensor_key], "int8", "bad shape type", id="bad shape type"), + pytest.param("f", ["tensor_key"], "int8", [3, 2, 5], id="bad key type"), + ], +) +def test_build_output_tensor_descriptor_unsuccessful(order, keys, dtype, dimension): + with pytest.raises(ValueError): + built_tensor = handler.build_output_tensor_descriptor( + order, keys, dtype, dimension + ) diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py new file mode 100644 index 0000000000..d33a0376a8 --- /dev/null +++ b/tests/test_message_handler/test_request.py @@ -0,0 +1,906 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +try: + import tensorflow as tf +except ImportError: + should_run_tf = False +else: + should_run_tf = True + tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) + tflow2 = tf.ones((10, 10, 3), dtype=tf.int64) + + tensor_3 = MessageHandler.build_tensor( + tflow1.numpy(), "c", "int8", list(tflow1.shape) + ) + tensor_4 = MessageHandler.build_tensor( + tflow2.numpy(), "c", "int64", list(tflow2.shape) + ) + + tf_attributes = MessageHandler.build_tf_request_attributes( + name="tf", tensor_type="sparse" + ) + + +try: + import torch +except ImportError: + should_run_torch = False +else: + should_run_torch = True + + torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) + torch2 = torch.ones((10, 10, 3), dtype=torch.int64) + + tensor_1 = MessageHandler.build_tensor( + torch1.numpy(), "c", "int8", list(torch1.shape) + ) + tensor_2 = MessageHandler.build_tensor( + torch2.numpy(), "c", "int64", list(torch2.shape) + ) + + torch_attributes = MessageHandler.build_torch_request_attributes("sparse") + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +model_key = MessageHandler.build_model_key("model_key") + +input_key1 = MessageHandler.build_tensor_key("input_key1") +input_key2 = MessageHandler.build_tensor_key("input_key2") + +output_key1 = MessageHandler.build_tensor_key("output_key1") +output_key2 = MessageHandler.build_tensor_key("output_key2") + +output_descriptor1 = MessageHandler.build_output_tensor_descriptor( + "c", [output_key1, output_key2], "int64", [] +) +output_descriptor2 = MessageHandler.build_output_tensor_descriptor("f", [], "auto", []) +output_descriptor3 = MessageHandler.build_output_tensor_descriptor( + "c", [output_key1], "none", [1, 2, 3] +) + + +if should_run_tf: + tf_indirect_request = MessageHandler.build_request( + b"reply", + b"model", + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + tf_attributes, + ) + + tf_direct_request = MessageHandler.build_request( + b"reply", + b"model", + "cpu", + [tensor_3, tensor_4], + [], + [output_descriptor1, output_descriptor2], + tf_attributes, + ) + +if should_run_torch: + torch_indirect_request = MessageHandler.build_request( + b"reply", + b"model", + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + torch_attributes, + ) + torch_direct_request = MessageHandler.build_request( + b"reply", + b"model", + "cpu", + [tensor_1, tensor_2], + [], + [output_descriptor1, output_descriptor2], + torch_attributes, + ) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + tf_attributes, 
+ ), + pytest.param( + b"another reply channel", + b"model data", + "gpu", + [input_key1], + [output_key2], + [output_descriptor1], + tf_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [input_key1], + [output_key2], + [output_descriptor1], + tf_attributes, + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_indirect_tf_successful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.reply == reply_channel + if built_request.model.which() == "modelKey": + assert built_request.model.modelKey.key == model.key + else: + assert built_request.model.modelData == model + assert built_request.device == device + assert built_request.input.which() == "inputKeys" + assert built_request.input.inputKeys[0].key == input[0].key + assert len(built_request.input.inputKeys) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "gpu", + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_indirect_torch_successful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.reply == reply_channel + if built_request.model.which() == "modelKey": + assert built_request.model.modelKey.key == model.key + else: + assert built_request.model.modelData == model + assert built_request.device == device + assert built_request.input.which() == "inputKeys" + assert built_request.input.inputKeys[0].key == input[0].key + assert len(built_request.input.inputKeys) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == 
custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + "gpu", + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + "bad device", + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + id="bad device", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + ["input_key1", "input_key2"], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [model_key], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + id="bad input schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + ["output_key1", "output_key2"], + [output_descriptor1], + torch_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [model_key], + [output_descriptor1], + torch_attributes, + id="bad output schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + model_key, + id="bad custom attributes schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + "bad descriptors", + torch_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_request_indirect_torch_unsuccessful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + "cpu", + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + tf_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + "gpu", + [input_key1], + [output_key2], + [output_descriptor1], + tf_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + "bad device", + [input_key1], + [output_key2], + [output_descriptor1], + tf_attributes, + id="bad device", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + ["input_key1", "input_key2"], + [output_key1, output_key2], + [output_descriptor1], + tf_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [model_key], + [output_key1, output_key2], + 
[output_descriptor1], + tf_attributes, + id="bad input schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + ["output_key1", "output_key2"], + [output_descriptor1], + tf_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [model_key], + [output_descriptor1], + tf_attributes, + id="bad output schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + model_key, + id="bad custom attributes schema type", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [input_key1], + [output_key1, output_key2], + "bad descriptors", + tf_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_request_indirect_tf_unsuccessful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply channel", + model_key, + "cpu", + [tensor_1, tensor_2], + [], + [output_descriptor2], + torch_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "gpu", + [tensor_1], + [], + [output_descriptor3], + torch_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [tensor_2], + [], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [tensor_1], + [], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_direct_torch_successful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.reply == reply_channel + if built_request.model.which() == "modelKey": + assert built_request.model.modelKey.key == model.key + else: + assert built_request.model.modelData == model + assert built_request.device == device + assert built_request.input.which() == "inputData" + assert built_request.input.inputData[0].blob == input[0].blob + assert len(built_request.input.inputData) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply 
channel", + model_key, + "cpu", + [tensor_3, tensor_4], + [], + [output_descriptor2], + tf_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "gpu", + [tensor_4], + [], + [output_descriptor3], + tf_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [tensor_4], + [], + [output_descriptor1], + tf_attributes, + ), + pytest.param( + b"another reply channel", + b"model data", + "auto", + [tensor_3], + [], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_direct_tf_successful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.reply == reply_channel + if built_request.model.which() == "modelKey": + assert built_request.model.modelKey.key == model.key + else: + assert built_request.model.modelData == model + assert built_request.device == device + assert built_request.input.which() == "inputData" + assert built_request.input.inputData[0].blob == input[0].blob + assert len(built_request.input.inputData) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + "cpu", + [tensor_1, tensor_2], + [], + [output_descriptor2], + torch_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + "gpu", + [tensor_1], + [], + [output_descriptor2], + torch_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + "bad device", + [tensor_2], + [], + [output_descriptor2], + torch_attributes, + id="bad device", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + ["input_key1", "input_key2"], + [], + [output_descriptor2], + torch_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [], + ["output_key1", "output_key2"], + [output_descriptor2], + torch_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [tensor_1], + [], + [output_descriptor2], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply_channel", + model_key, + "cpu", + [tensor_1, tensor_2], + [], + ["output_descriptor2"], + torch_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_torch_request_direct_unsuccessful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "reply_channel, 
model, device, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + "cpu", + [tensor_3, tensor_4], + [], + [output_descriptor2], + tf_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + "gpu", + [tensor_4], + [], + [output_descriptor2], + tf_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + "bad device", + [tensor_3], + [], + [output_descriptor2], + tf_attributes, + id="bad device", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + ["input_key1", "input_key2"], + [], + [output_descriptor2], + tf_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [], + ["output_key1", "output_key2"], + [output_descriptor2], + tf_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + "cpu", + [tensor_4], + [], + [output_descriptor2], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply_channel", + model_key, + "cpu", + [tensor_3, tensor_4], + [], + ["output_descriptor2"], + tf_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_tf_request_direct_unsuccessful( + reply_channel, model, device, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + device, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "req", + [ + pytest.param(torch_indirect_request, id="indirect"), + pytest.param(torch_direct_request, id="direct"), + ], +) +def test_serialize_torch_request_successful(req): + serialized = MessageHandler.serialize_request(req) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_request(serialized) + assert deserialized.to_dict() == req.to_dict() + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "req", + [ + pytest.param(tf_indirect_request, id="indirect"), + pytest.param(tf_direct_request, id="direct"), + ], +) +def test_serialize_tf_request_successful(req): + serialized = MessageHandler.serialize_request(req) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_request(serialized) + assert deserialized.to_dict() == req.to_dict() diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py new file mode 100644 index 0000000000..9d59a18793 --- /dev/null +++ b/tests/test_message_handler/test_response.py @@ -0,0 +1,341 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +try: + import tensorflow as tf +except ImportError: + should_run_tf = False +else: + should_run_tf = True + + tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) + tflow2 = tf.ones((1040, 1040, 3), dtype=tf.int64) + + small_tf_tensor = MessageHandler.build_tensor( + tflow1.numpy(), "c", "int8", list(tflow1.shape) + ) + medium_tf_tensor = MessageHandler.build_tensor( + tflow2.numpy(), "c", "int64", list(tflow2.shape) + ) + + tf_attributes = MessageHandler.build_tf_response_attributes() + + tf_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [small_tf_tensor, medium_tf_tensor], + tf_attributes, + ) + + +try: + import torch +except ImportError: + should_run_torch = False +else: + should_run_torch = True + + torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) + torch2 = torch.ones((1040, 1040, 3), dtype=torch.int64) + + small_torch_tensor = MessageHandler.build_tensor( + torch1.numpy(), "c", "int8", list(torch1.shape) + ) + medium_torch_tensor = MessageHandler.build_tensor( + torch2.numpy(), "c", "int64", list(torch2.shape) + ) + + torch_attributes = MessageHandler.build_torch_response_attributes() + + torch_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [small_torch_tensor, medium_torch_tensor], + torch_attributes, + ) + + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +result_key1 = MessageHandler.build_tensor_key("result_key1") +result_key2 = MessageHandler.build_tensor_key("result_key2") + + +if should_run_tf: + tf_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + tf_attributes, + ) + +if should_run_torch: + torch_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + torch_attributes, + ) + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + 200, + "Yay, it worked!", + [small_torch_tensor, medium_torch_tensor], + None, + id="tensor list", + ), + pytest.param( + 200, + "Yay, it worked!", + [small_torch_tensor], + torch_attributes, + id="small tensor", + ), + pytest.param( + 200, + "Yay, it worked!", + [result_key1, result_key2], + torch_attributes, + id="tensor key list", + ), + ], +) +def test_build_torch_response_successful( + status, status_message, result, custom_attribute +): + response = MessageHandler.build_response( + status=status, + message=status_message, + result=result, + custom_attributes=custom_attribute, + ) + assert response is not None + assert response.status == status + assert response.message == status_message + if response.result.which() == "keys": + assert response.result.keys[0].to_dict() == result[0].to_dict() + else: + assert response.result.data[0].to_dict() == result[0].to_dict() + + 
+@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + 200, + "Yay, it worked!", + [small_tf_tensor, medium_tf_tensor], + None, + id="tensor list", + ), + pytest.param( + 200, + "Yay, it worked!", + [small_tf_tensor], + tf_attributes, + id="small tensor", + ), + pytest.param( + 200, + "Yay, it worked!", + [result_key1, result_key2], + tf_attributes, + id="tensor key list", + ), + ], +) +def test_build_tf_response_successful(status, status_message, result, custom_attribute): + response = MessageHandler.build_response( + status=status, + message=status_message, + result=result, + custom_attributes=custom_attribute, + ) + assert response is not None + assert response.status == status + assert response.message == status_message + if response.result.which() == "keys": + assert response.result.keys[0].to_dict() == result[0].to_dict() + else: + assert response.result.data[0].to_dict() == result[0].to_dict() + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + "bad status", + "Yay, it worked!", + [small_tf_tensor, medium_tf_tensor], + None, + id="bad status", + ), + pytest.param( + "complete", + 200, + [small_tf_tensor], + tf_attributes, + id="bad status message", + ), + pytest.param( + "complete", + "Yay, it worked!", + ["result_key1", "result_key2"], + tf_attributes, + id="bad result", + ), + pytest.param( + "complete", + "Yay, it worked!", + [tf_attributes], + tf_attributes, + id="bad result type", + ), + pytest.param( + "complete", + "Yay, it worked!", + [small_tf_tensor, medium_tf_tensor], + "custom attributes", + id="bad custom attributes", + ), + pytest.param( + "complete", + "Yay, it worked!", + [small_tf_tensor, medium_tf_tensor], + result_key1, + id="bad custom attributes type", + ), + ], +) +def test_build_tf_response_unsuccessful( + status, status_message, result, custom_attribute +): + with pytest.raises(ValueError): + response = MessageHandler.build_response( + status, status_message, result, custom_attribute + ) + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + "bad status", + "Yay, it worked!", + [small_torch_tensor, medium_torch_tensor], + None, + id="bad status", + ), + pytest.param( + "complete", + 200, + [small_torch_tensor], + torch_attributes, + id="bad status message", + ), + pytest.param( + "complete", + "Yay, it worked!", + ["result_key1", "result_key2"], + torch_attributes, + id="bad result", + ), + pytest.param( + "complete", + "Yay, it worked!", + [torch_attributes], + torch_attributes, + id="bad result type", + ), + pytest.param( + "complete", + "Yay, it worked!", + [small_torch_tensor, medium_torch_tensor], + "custom attributes", + id="bad custom attributes", + ), + pytest.param( + "complete", + "Yay, it worked!", + [small_torch_tensor, medium_torch_tensor], + result_key1, + id="bad custom attributes type", + ), + ], +) +def test_build_torch_response_unsuccessful( + status, status_message, result, custom_attribute +): + with pytest.raises(ValueError): + response = MessageHandler.build_response( + status, status_message, result, custom_attribute + ) + + +@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") +@pytest.mark.parametrize( + "response", + [ + 
pytest.param(torch_indirect_response, id="indirect"), + pytest.param(torch_direct_response, id="direct"), + ], +) +def test_torch_serialize_response(response): + serialized = MessageHandler.serialize_response(response) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_response(serialized) + assert deserialized.to_dict() == response.to_dict() + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +@pytest.mark.parametrize( + "response", + [ + pytest.param(tf_indirect_response, id="indirect"), + pytest.param(tf_direct_response, id="direct"), + ], +) +def test_tf_serialize_response(response): + serialized = MessageHandler.serialize_response(response) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_response(serialized) + assert deserialized.to_dict() == response.to_dict() From 38081dabeddf7e13c5ad9bad53a24e925e6b7728 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:45:14 -0400 Subject: [PATCH 02/60] ML Worker Manager MVP (#608) This PR contains an ML worker manager MVP. The worker manager executes a single-threaded version of the planned ML pipeline for a single worker instance. [ committed by @ankona ] [ approved by @mellis13 ] --- doc/changelog.md | 1 + smartsim/_core/entrypoints/service.py | 135 ++++++++ smartsim/_core/mli/comm/channel/__init__.py | 0 smartsim/_core/mli/comm/channel/channel.py | 52 +++ .../_core/mli/comm/channel/dragonchannel.py | 52 +++ smartsim/_core/mli/infrastructure/__init__.py | 0 .../infrastructure/control/workermanager.py | 294 ++++++++++++++++ .../mli/infrastructure/storage/__init__.py | 0 .../storage/dragonfeaturestore.py | 70 ++++ .../infrastructure/storage/featurestore.py | 49 +++ .../mli/infrastructure/worker/__init__.py | 0 .../_core/mli/infrastructure/worker/worker.py | 313 +++++++++++++++++ tests/mli/__init__.py | 0 tests/mli/channel.py | 59 ++++ tests/mli/featurestore.py | 128 +++++++ .../mli/test_core_machine_learning_worker.py | 321 ++++++++++++++++++ tests/mli/test_default_torch_worker.py | 206 +++++++++++ tests/mli/test_integrated_torch_worker.py | 290 ++++++++++++++++ tests/mli/test_service.py | 205 +++++++++++ tests/mli/test_worker_manager.py | 196 +++++++++++ tests/mli/worker.py | 128 +++++++ 21 files changed, 2499 insertions(+) create mode 100644 smartsim/_core/entrypoints/service.py create mode 100644 smartsim/_core/mli/comm/channel/__init__.py create mode 100644 smartsim/_core/mli/comm/channel/channel.py create mode 100644 smartsim/_core/mli/comm/channel/dragonchannel.py create mode 100644 smartsim/_core/mli/infrastructure/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/control/workermanager.py create mode 100644 smartsim/_core/mli/infrastructure/storage/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py create mode 100644 smartsim/_core/mli/infrastructure/storage/featurestore.py create mode 100644 smartsim/_core/mli/infrastructure/worker/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/worker/worker.py create mode 100644 tests/mli/__init__.py create mode 100644 tests/mli/channel.py create mode 100644 tests/mli/featurestore.py create mode 100644 tests/mli/test_core_machine_learning_worker.py create mode 100644 tests/mli/test_default_torch_worker.py create mode 100644 tests/mli/test_integrated_torch_worker.py create mode 100644 tests/mli/test_service.py create mode 100644 tests/mli/test_worker_manager.py create mode 100644 
tests/mli/worker.py diff --git a/doc/changelog.md b/doc/changelog.md index f8f712e069..e38d234965 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add ML worker manager, sample worker, and feature store - Added schemas and MessageHandler class for de/serialization of inference requests and response messages diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py new file mode 100644 index 0000000000..e03df6bea1 --- /dev/null +++ b/smartsim/_core/entrypoints/service.py @@ -0,0 +1,135 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import time +import typing as t +from abc import ABC, abstractmethod + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class Service(ABC): + """Base contract for standalone entrypoint scripts. Defines API for entrypoint + behaviors (event loop, automatic shutdown, cooldown) as well as simple + hooks for status changes""" + + def __init__( + self, as_service: bool = False, cooldown: int = 0, loop_delay: int = 0 + ) -> None: + """Initialize the ServiceHost + :param as_service: Determines if the host will run until shutdown criteria + are met or as a run-once instance + :param cooldown: Period of time to allow service to run before automatic + shutdown, in seconds. A non-zero, positive integer.""" + self._as_service = as_service + """If the service should run until shutdown function returns True""" + self._cooldown = abs(cooldown) + """Duration of a cooldown period between requests to the service + before shutdown""" + self._loop_delay = abs(loop_delay) + """Forced delay between iterations of the event loop""" + + @abstractmethod + def _on_iteration(self) -> None: + """The user-defined event handler. Executed repeatedly until shutdown + conditions are satisfied and cooldown is elapsed. + """ + + @abstractmethod + def _can_shutdown(self) -> bool: + """Return true when the criteria to shut down the service are met.""" + + def _on_start(self) -> None: + """Empty hook method for use by subclasses. 
Called on initial entry into + ServiceHost `execute` event loop before `_on_iteration` is invoked.""" + logger.debug(f"Starting {self.__class__.__name__}") + + def _on_shutdown(self) -> None: + """Empty hook method for use by subclasses. Called immediately after exiting + the main event loop during automatic shutdown.""" + logger.debug(f"Shutting down {self.__class__.__name__}") + + def _on_cooldown_elapsed(self) -> None: + """Empty hook method for use by subclasses. Called on every event loop + iteration immediately upon exceeding the cooldown period""" + logger.debug(f"Cooldown exceeded by {self.__class__.__name__}") + + def _on_delay(self) -> None: + """Empty hook method for use by subclasses. Called on every event loop + iteration immediately before executing a delay before the next iteration""" + logger.debug(f"Service iteration waiting for {self.__class__.__name__}s") + + def _log_cooldown(self, elapsed: float) -> None: + """Log the remaining cooldown time, if any""" + remaining = self._cooldown - elapsed + if remaining > 0: + logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") + else: + logger.info(f"exceeded cooldown {self._cooldown}s by {abs(remaining):.2f}s") + + def execute(self) -> None: + """The main event loop of a service host. Evaluates shutdown criteria and + combines with a cooldown period to allow automatic service termination. + Responsible for executing calls to subclass implementation of `_on_iteration`""" + self._on_start() + + running = True + cooldown_start: t.Optional[datetime.datetime] = None + + while running: + self._on_iteration() + + # allow immediate shutdown if not set to run as a service + if not self._as_service: + running = False + continue + + # reset cooldown period if shutdown criteria are not met + if not self._can_shutdown(): + cooldown_start = None + + # start tracking cooldown elapsed once eligible to quit + if cooldown_start is None: + cooldown_start = datetime.datetime.now() + + # change running state if cooldown period is exceeded + if self._cooldown > 0: + elapsed = datetime.datetime.now() - cooldown_start + running = elapsed.total_seconds() < self._cooldown + self._log_cooldown(elapsed.total_seconds()) + if not running: + self._on_cooldown_elapsed() + elif self._cooldown < 1 and self._can_shutdown(): + running = False + + if self._loop_delay: + self._on_delay() + time.sleep(self._loop_delay) + + self._on_shutdown() diff --git a/smartsim/_core/mli/comm/channel/__init__.py b/smartsim/_core/mli/comm/channel/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py new file mode 100644 index 0000000000..201ab9deab --- /dev/null +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -0,0 +1,52 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from abc import ABC, abstractmethod + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class CommChannelBase(ABC): + """Base class for abstracting a message passing mechanism""" + + def __init__(self, descriptor: t.Union[str, bytes]) -> None: + """Initialize the CommChannel instance""" + self._descriptor = descriptor + + @abstractmethod + def send(self, value: bytes) -> None: + """Send a message throuh the underlying communication channel + :param value: The value to send""" + + @property + def descriptor(self) -> bytes: + """Return the channel descriptor for the underlying dragon channel""" + if isinstance(self._descriptor, str): + return self._descriptor.encode("utf-8") + return self._descriptor diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py new file mode 100644 index 0000000000..4fd26861ca --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -0,0 +1,52 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
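CommChannelBase requires only a descriptor and a send() implementation. A hypothetical in-memory subclass (illustration only; InMemoryCommChannel is not part of this patch) shows that minimal contract, in the same spirit as the FileSystemCommChannel test helper added later in this patch:

```python
import typing as t

from smartsim._core.mli.comm.channel.channel import CommChannelBase


class InMemoryCommChannel(CommChannelBase):
    """Hypothetical channel that records sent messages; handy for unit tests."""

    def __init__(self, descriptor: t.Union[str, bytes]) -> None:
        super().__init__(descriptor)
        self.sent: t.List[bytes] = []

    def send(self, value: bytes) -> None:
        # no real transport; capture the payload for later inspection
        self.sent.append(value)


channel = InMemoryCommChannel("callback-0")
channel.send(b"serialized response bytes")
assert channel.descriptor == b"callback-0"  # str descriptors are utf-8 encoded
assert channel.sent == [b"serialized response bytes"]
```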
+ +import typing as t + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + +if t.TYPE_CHECKING: + import dragon.channels as dch + import dragon.utils as du + + +class DragonCommChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon channel""" + + def __init__(self, key: bytes) -> None: + """Initialize the DragonCommChannel instance""" + super().__init__(key) + # todo: do we need memory pool information to construct the channel correctly? + self._channel: "dch.Channel" = du.get_channel(key) + + def send(self, value: bytes) -> None: + """Send a message throuh the underlying communication channel + :param value: The value to send""" + logger.debug(f"Channel {self.descriptor.decode('utf-8')} sending message") + self._channel.send_bytes(value) diff --git a/smartsim/_core/mli/infrastructure/__init__.py b/smartsim/_core/mli/infrastructure/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py new file mode 100644 index 0000000000..b3b79f7f30 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -0,0 +1,294 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
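The WorkerManager defined below builds on the Service base class added in smartsim/_core/entrypoints/service.py earlier in this patch. A minimal, hypothetical Service subclass (illustration only) shows the contract: perform work in _on_iteration() and signal readiness to stop via _can_shutdown():

```python
from smartsim._core.entrypoints.service import Service


class CountdownService(Service):
    """Hypothetical service performing a fixed amount of work before stopping."""

    def __init__(self, num_iterations: int) -> None:
        # run as a service with a one-second cooldown once shutdown criteria are met
        super().__init__(as_service=True, cooldown=1)
        self._remaining = num_iterations

    def _on_iteration(self) -> None:
        # one unit of work per pass through the event loop
        self._remaining -= 1

    def _can_shutdown(self) -> bool:
        return self._remaining <= 0


# CountdownService(5).execute() iterates until _can_shutdown() returns True,
# then exits once the cooldown period has elapsed.
```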
+ +import multiprocessing as mp +import typing as t + +import numpy as np + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.worker import ( + InferenceReply, + InferenceRequest, + MachineLearningWorkerBase, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.mli.mli_schemas.response.response_capnp import Response +from smartsim.log import get_logger + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum + +logger = get_logger(__name__) + + +def deserialize_message( + data_blob: bytes, channel_type: t.Type[CommChannelBase] +) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize""" + # todo: consider moving to XxxCore and only making + # workers implement the inputs and model conversion? + + # alternatively, consider passing the capnproto models + # to this method instead of the data_blob... + + # something is definitely wrong here... client shouldn't have to touch + # callback (or batch size) + + request = MessageHandler.deserialize_request(data_blob) + # return request + device = request.device + model_key: t.Optional[str] = None + model_bytes: t.Optional[bytes] = None + + if request.model.which() == "modelKey": + model_key = request.model.modelKey.key + elif request.model.which() == "modelData": + model_bytes = request.model.modelData + + callback_key = request.replyChannel.reply + + # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` + comm_channel = channel_type(callback_key) + # comm_channel = DragonCommChannel(request.replyChannel) + + input_keys: t.Optional[t.List[str]] = None + input_bytes: t.Optional[t.List[bytes]] = ( + None # these will really be tensors already + ) + + # # client example + # msg = Message() + # t = torch.Tensor() + # msg.inputs = [custom_byte_converter(t)] + # mli_client.request_inference(msg) + # # end client + input_meta: t.List[t.Any] = [] + + if request.input.which() == "inputKeys": + input_keys = [input_key.key for input_key in request.input.inputKeys] + elif request.input.which() == "inputData": + input_bytes = [data.blob for data in request.input.inputData] + input_meta = [data.tensorDescriptor for data in request.input.inputData] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + raw_model=model_bytes, + batch_size=0, + device=device, + ) + return inference_request + + +def build_failure_reply(status: "StatusEnum", message: str) -> Response: + return MessageHandler.build_response( + status=status, # todo: need to indicate correct status + message=message, # todo: decide what these will be + result=[], + custom_attributes=None, + ) + + +def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + prepared_outputs: t.List[t.Any] = [] + if reply.output_keys: + for key in reply.output_keys: + if not key: + continue + msg_key = MessageHandler.build_tensor_key(key) + prepared_outputs.append(msg_key) + elif reply.outputs: + arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ + output.numpy() for output in reply.outputs + ] + for tensor in arrays: + # todo: need to have 
the output attributes specified in the req? + # maybe, add `MessageHandler.dtype_of(tensor)`? + # can `build_tensor` do dtype and shape? + msg_tensor = MessageHandler.build_tensor( + tensor, + "c", + "float32", + [1], + ) + prepared_outputs.append(msg_tensor) + return prepared_outputs + + +def build_reply(reply: InferenceReply) -> Response: + results = prepare_outputs(reply) + + return MessageHandler.build_response( + status="complete", + message="success", + result=results, + custom_attributes=None, + ) + + +class WorkerManager(Service): + """An implementation of a service managing distribution of tasks to + machine learning workers""" + + def __init__( + self, + task_queue: "mp.Queue[bytes]", + worker: MachineLearningWorkerBase, + feature_store: t.Optional[FeatureStore] = None, + as_service: bool = False, + cooldown: int = 0, + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + ) -> None: + """Initialize the WorkerManager + :param task_queue: The queue to monitor for new tasks + :param workers: A worker to manage + :param feature_store: The persistence mechanism + :param as_service: Specifies run-once or run-until-complete behavior of service + :param cooldown: Number of seconds to wait before shutting down afer + shutdown criteria are met + :param comm_channel_type: The type of communication channel used for callbacks + """ + super().__init__(as_service, cooldown) + + """a collection of workers the manager is controlling""" + self._task_queue: "mp.Queue[bytes]" = task_queue + """the queue the manager monitors for new tasks""" + self._feature_store: t.Optional[FeatureStore] = feature_store + """a feature store to retrieve models from""" + self._worker = worker + """The ML Worker implementation""" + self._comm_channel_type = comm_channel_type + """The type of communication channel to construct for callbacks""" + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed. 
+ :param request: The request to validate + :return: True if the request is valid, False otherwise""" + if not self._feature_store: + if request.model_key: + logger.error("Unable to load model by key without feature store") + return False + + if request.input_keys: + logger.error("Unable to load inputs by key without feature store") + return False + + if request.output_keys: + logger.error("Unable to persist outputs by key without feature store") + return False + + if not request.model_key and not request.raw_model: + logger.error("Unable to continue without model bytes or feature store key") + return False + + if not request.input_keys and not request.raw_inputs: + logger.error("Unable to continue without input bytes or feature store keys") + return False + + if request.callback is None: + logger.error("No callback channel provided in request") + return False + + return True + + def _on_iteration(self) -> None: + """Executes calls to the machine learning worker implementation to complete + the inference pipeline""" + logger.debug("executing worker manager pipeline") + + if self._task_queue is None: + logger.warning("No queue to check for tasks") + return + + # perform default deserialization of the message envelope + request_bytes: bytes = self._task_queue.get() + + request = deserialize_message(request_bytes, self._comm_channel_type) + if not self._validate_request(request): + return + + # # let the worker perform additional custom deserialization + # request = self._worker.deserialize(request_bytes) + + fetch_model_result = self._worker.fetch_model(request, self._feature_store) + model_result = self._worker.load_model(request, fetch_model_result) + fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + transformed_input = self._worker.transform_input(request, fetch_input_result) + + # batch: t.Collection[_Datum] = transform_result.transformed_input + # if self._batch_size: + # batch = self._worker.batch_requests(transform_result, self._batch_size) + + reply = InferenceReply() + + try: + execute_result = self._worker.execute( + request, model_result, transformed_input + ) + + transformed_output = self._worker.transform_output(request, execute_result) + + if request.output_keys: + reply.output_keys = self._worker.place_output( + request, transformed_output, self._feature_store + ) + else: + reply.outputs = transformed_output.outputs + except Exception: + logger.exception("Error executing worker") + reply.failed = True + + if reply.failed: + response = build_failure_reply("fail", "failure-occurred") + else: + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "no-results") + + response = build_reply(reply) + + # serialized = self._worker.serialize_reply(request, transformed_output) + serialized_resp = MessageHandler.serialize_response(response) # type: ignore + if request.callback: + request.callback.send(serialized_resp) + + def _can_shutdown(self) -> bool: + """Return true when the criteria to shut down the service are met.""" + # todo: determine shutdown criteria + # will we receive a completion message? + # will we let MLI mgr just kill this? 
+ # time_diff = self._last_event - datetime.datetime.now() + # if time_diff.total_seconds() > self._cooldown: + # return True + # return False + return self._worker is None diff --git a/smartsim/_core/mli/infrastructure/storage/__init__.py b/smartsim/_core/mli/infrastructure/storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py new file mode 100644 index 0000000000..ea8f06977d --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -0,0 +1,70 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +if t.TYPE_CHECKING: + from dragon.data.distdictionary.dragon_dict import DragonDict + + +logger = get_logger(__name__) + + +class DragonFeatureStore(FeatureStore): + """A feature store backed by a dragon distributed dictionary""" + + def __init__(self, storage: "DragonDict") -> None: + """Initialize the DragonFeatureStore instance""" + self._storage = storage + + def __getitem__(self, key: str) -> t.Any: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + key_ = key.encode("utf-8") + try: + return self._storage[key_] + except Exception as ex: + # note: explicitly avoid round-trip to check for key existence + raise sse.SmartSimError(f"{key} not found in feature store") from ex + + def __setitem__(self, key: str, value: bytes) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + key_ = key.encode("utf-8") + self._storage[key_] = value + + def __contains__(self, key: t.Union[str, bytes]) -> bool: + """Membership operator to test for a key existing within the feature store. 
+ Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + if isinstance(key, str): + key = key.encode("utf-8") + return key in self._storage diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py new file mode 100644 index 0000000000..ec4086b732 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -0,0 +1,49 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from abc import ABC, abstractmethod + + +class FeatureStore(ABC): + """Abstract base class providing the common interface for retrieving + values from a feature store implementation""" + + @abstractmethod + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + + @abstractmethod + def __setitem__(self, key: str, value: bytes) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/__init__.py b/smartsim/_core/mli/infrastructure/worker/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py new file mode 100644 index 0000000000..99b51e178d --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -0,0 +1,313 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from abc import ABC, abstractmethod + +import smartsim.error as sse +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class InferenceRequest: + """Internal representation of an inference request from a client""" + + def __init__( + self, + model_key: t.Optional[str] = None, + callback: t.Optional[CommChannelBase] = None, + raw_inputs: t.Optional[t.List[bytes]] = None, + # todo: copying byte array is likely to create a copy of the data in + # capnproto and will be a performance issue later + input_keys: t.Optional[t.List[str]] = None, + input_meta: t.Optional[t.List[t.Any]] = None, + output_keys: t.Optional[t.List[str]] = None, + raw_model: t.Optional[bytes] = None, + batch_size: int = 0, + device: t.Optional[str] = None, + ): + """Initialize the object""" + self.model_key = model_key + self.raw_model = raw_model + self.callback = callback + self.raw_inputs = raw_inputs + self.input_keys = input_keys or [] + self.input_meta = input_meta or [] + self.output_keys = output_keys or [] + self.batch_size = batch_size + self.device = device + + +class InferenceReply: + """Internal representation of the reply to a client request for inference""" + + def __init__( + self, + outputs: t.Optional[t.Collection[t.Any]] = None, + output_keys: t.Optional[t.Collection[str]] = None, + failed: bool = False, + ) -> None: + """Initialize the object""" + self.outputs: t.Collection[t.Any] = outputs or [] + self.output_keys: t.Collection[t.Optional[str]] = output_keys or [] + self.failed = failed + + +class LoadModelResult: + """A wrapper around a loaded model""" + + def __init__(self, model: t.Any) -> None: + """Initialize the object""" + self.model = model + + +class TransformInputResult: + """A wrapper around a transformed input""" + + def __init__(self, result: t.Any) -> None: + """Initialize the object""" + self.transformed = result + + +class ExecuteResult: + """A wrapper around inference results""" + + def __init__(self, result: t.Any) -> None: + """Initialize the object""" + self.predictions = result + + +class FetchInputResult: + """A wrapper around fetched inputs""" + + def __init__(self, result: t.List[bytes]) -> None: + """Initialize the object""" + self.inputs = result + + +class 
TransformOutputResult: + """A wrapper around inference results transformed for transmission""" + + def __init__( + self, result: t.Any, shape: t.List[int], order: str, dtype: str + ) -> None: + """Initialize the OutputTransformResult""" + self.outputs = result + self.shape = shape + self.order = order + self.dtype = dtype + # todo: determine if each output must have an individual (shape, order, dtype) + + +class CreateInputBatchResult: + """A wrapper around inputs batched into a single request""" + + def __init__(self, result: t.Any) -> None: + """Initialize the object""" + self.batch = result + + +class FetchModelResult: + """A wrapper around raw fetched models""" + + def __init__(self, result: bytes) -> None: + """Initialize the object""" + self.model_bytes = result + + +class MachineLearningWorkerCore: + """Basic functionality of ML worker that is shared across all worker types""" + + @staticmethod + def fetch_model( + request: InferenceRequest, feature_store: t.Optional[FeatureStore] + ) -> FetchModelResult: + """Given a resource key, retrieve the raw model from a feature store + :param request: The request that triggered the pipeline + :param feature_store: The feature store used for persistence + :return: Raw bytes of the model""" + if not feature_store: + raise ValueError("Feature store is required for model retrieval") + + if request.raw_model: + # Should we cache model in the feature store? + # model_key = hash(request.raw_model) + # feature_store[model_key] = request.raw_model + # short-circuit and return the directly supplied model + return FetchModelResult(request.raw_model) + + if not request.model_key: + raise sse.SmartSimError( + "Key must be provided to retrieve model from feature store" + ) + + try: + raw_bytes = feature_store[request.model_key] + return FetchModelResult(raw_bytes) + except FileNotFoundError as ex: + logger.exception(ex) + raise sse.SmartSimError( + f"Model could not be retrieved with key {request.model_key}" + ) from ex + + @staticmethod + def fetch_inputs( + request: InferenceRequest, feature_store: t.Optional[FeatureStore] + ) -> FetchInputResult: + """Given a collection of ResourceKeys, identify the physical location + and input metadata + :param request: The request that triggered the pipeline + :param feature_store: The feature store used for persistence + :return: the fetched input""" + if not feature_store: + raise ValueError("Feature store is required for input retrieval") + + if request.input_keys: + data: t.List[bytes] = [] + for input_ in request.input_keys: + try: + tensor_bytes = feature_store[input_] + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise sse.SmartSimError( + f"Model could not be retrieved with key {input_}" + ) from ex + return FetchInputResult(data) + + if request.raw_inputs: + return FetchInputResult(request.raw_inputs) + + raise ValueError("No input source") + + @staticmethod + def batch_requests( + request: InferenceRequest, transform_result: TransformInputResult + ) -> CreateInputBatchResult: + """Create a batch of requests. Return the batch when batch_size datum have been + collected or a configured batch duration has elapsed. 
+ :param request: The request that triggered the pipeline + :param transform_result: Transformed inputs ready for batching + :return: `None` if batch size has not been reached and timeout not exceeded.""" + if transform_result is not None or request.batch_size: + raise NotImplementedError("Batching is not yet supported") + return CreateInputBatchResult(None) + + @staticmethod + def place_output( + request: InferenceRequest, + transform_result: TransformOutputResult, + feature_store: t.Optional[FeatureStore], + ) -> t.Collection[t.Optional[str]]: + """Given a collection of data, make it available as a shared resource in the + feature store + :param request: The request that triggered the pipeline + :param execute_result: Results from inference + :param feature_store: The feature store used for persistence + :return: A collection of keys that were placed in the feature store""" + if not feature_store: + raise ValueError("Feature store is required for output persistence") + + keys: t.List[t.Optional[str]] = [] + # need to decide how to get back to original sub-batch inputs so they can be + # accurately placed, datum might need to include this. + + # Consider parallelizing all PUT feature_store operations + for k, v in zip(request.output_keys, transform_result.outputs): + feature_store[k] = v + keys.append(k) + + return keys + + +class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): + """Abstrct base class providing contract for a machine learning + worker implementation.""" + + # @staticmethod + # @abstractmethod + # def deserialize(request: InferenceRequest) -> InferenceRequest: + # """Given a collection of data serialized to bytes, convert the bytes + # to a proper representation used by the ML backend + # :param data_blob: inference request as a byte-serialized blob + # :return: InferenceRequest deserialized from the input""" + + @staticmethod + @abstractmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult + ) -> LoadModelResult: + """Given a loaded MachineLearningModel, ensure it is loaded into + device memory + :param request: The request that triggered the pipeline + :return: ModelLoadResult wrapping the model loaded for the request""" + + @staticmethod + @abstractmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult + ) -> TransformInputResult: + """Given a collection of data, perform a transformation on the data + :param request: The request that triggered the pipeline + :param fetch_result: Raw output from fetching inputs out of a feature store + :return: The transformed inputs wrapped in a InputTransformResult""" + + @staticmethod + @abstractmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + """Execute an ML model on inputs transformed for use by the model + :param request: The request that triggered the pipeline + :param load_result: The result of loading the model onto device memory + :param transform_result: The result of transforming inputs for model consumption + :return: The result of inference wrapped in an ExecuteResult""" + + @staticmethod + @abstractmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + ) -> TransformOutputResult: + """Given inference results, perform transformations required to + transmit results to the requestor. 
+ :param request: The request that triggered the pipeline + :param execute_result: The result of inference wrapped in an ExecuteResult + :return:""" + + # @staticmethod + # @abstractmethod + # def serialize_reply( + # request: InferenceRequest, results: OutputTransformResult + # ) -> bytes: + # """Given an output, serialize to bytes for transport + # :param reply: The result of the inference pipeline + # :return: a byte-serialized version of the reply""" diff --git a/tests/mli/__init__.py b/tests/mli/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/mli/channel.py b/tests/mli/channel.py new file mode 100644 index 0000000000..4bc2014ea3 --- /dev/null +++ b/tests/mli/channel.py @@ -0,0 +1,59 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class FileSystemCommChannel(CommChannelBase): + """Passes messages by writing to a file""" + + def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + """Initialize the FileSystemCommChannel instance""" + if not isinstance(key, bytes): + super().__init__(key.as_posix().encode("utf-8")) + self._file_path = key + else: + super().__init__(key) + self._file_path = pathlib.Path(key.decode("utf-8")) + + if not self._file_path.parent.exists(): + self._file_path.parent.mkdir(parents=True) + + self._file_path.touch() + + def send(self, value: bytes) -> None: + """Send a message throuh the underlying communication channel + :param value: The value to send""" + logger.debug( + f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" + ) + self._file_path.write_bytes(value) diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py new file mode 100644 index 0000000000..93b3134318 --- /dev/null +++ b/tests/mli/featurestore.py @@ -0,0 +1,128 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + + +class MemoryFeatureStore(FeatureStore): + """A feature store with values persisted only in local memory""" + + def __init__(self) -> None: + """Initialize the MemoryFeatureStore instance""" + self._storage: t.Dict[str, bytes] = {} + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + if key not in self._storage: + raise sse.SmartSimError(f"{key} not found in feature store") + return self._storage[key] + + def __setitem__(self, key: str, value: bytes) -> None: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + self._storage[key] = value + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + return key in self._storage + + +class FileSystemFeatureStore(FeatureStore): + """Alternative feature store implementation for testing. 
Stores all + data on the file system""" + + def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + """Initialize the FileSystemFeatureStore instance + :param storage_dir: (optional) root directory to store all data relative to""" + self._storage_dir = storage_dir + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + path = self._key_path(key) + if not path.exists(): + raise sse.SmartSimError(f"{path} not found in feature store") + return path.read_bytes() + + def __setitem__(self, key: str, value: bytes) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + path = self._key_path(key, create=True) + path.write_bytes(value) + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + path = self._key_path(key) + return path.exists() + + def _key_path(self, key: str, create: bool = False) -> pathlib.Path: + """Given a key, return a path that is optionally combined with a base + directory used by the FileSystemFeatureStore. + :param key: Unique key of an item to retrieve from the feature store""" + value = pathlib.Path(key) + + if self._storage_dir: + value = self._storage_dir / key + + if create: + value.parent.mkdir(parents=True, exist_ok=True) + + return value + + +class DragonDict: + """Mock implementation of a dragon dictionary""" + + def __init__(self) -> None: + """Initialize the mock DragonDict instance""" + self._storage: t.Dict[bytes, t.Any] = {} + + def __getitem__(self, key: bytes) -> t.Any: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + return self._storage[key] + + def __setitem__(self, key: bytes, value: t.Any) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + self._storage[key] = value + + def __contains__(self, key: bytes) -> bool: + """Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + return key in self._storage diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py new file mode 100644 index 0000000000..cff02c9c1c --- /dev/null +++ b/tests/mli/test_core_machine_learning_worker.py @@ -0,0 +1,321 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import time + +import pytest +import torch + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.worker.worker import ( + InferenceRequest, + MachineLearningWorkerCore, + TransformInputResult, + TransformOutputResult, +) +from smartsim._core.utils import installed_redisai_backends + +from .featurestore import FileSystemFeatureStore, MemoryFeatureStore + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_b + +# retrieved from pytest fixtures +is_dragon = ( + pytest.test_launcher == "dragon" if hasattr(pytest, "test_launcher") else False +) +torch_available = "torch" in installed_redisai_backends() + + +@pytest.fixture +def persist_torch_model(test_dir: str) -> pathlib.Path: + ts_start = time.time_ns() + print("Starting model file creation...") + test_path = pathlib.Path(test_dir) + model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + ts_end = time.time_ns() + + ts_elapsed = (ts_end - ts_start) / 1000000000 + print(f"Model file creation took {ts_elapsed} seconds") + return model_path + + +@pytest.fixture +def persist_torch_tensor(test_dir: str) -> pathlib.Path: + ts_start = time.time_ns() + print("Starting model file creation...") + test_path = pathlib.Path(test_dir) + file_path = test_path / "tensor.pt" + + tensor = torch.randn((100, 100, 2)) + torch.save(tensor, file_path) + ts_end = time.time_ns() + + ts_elapsed = (ts_end - ts_start) / 1000000000 + print(f"Tensor file creation took {ts_elapsed} seconds") + return file_path + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + key = str(persist_torch_model) + feature_store = FileSystemFeatureStore() + feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() + + request = InferenceRequest(model_key=key) + + fetch_result = worker.fetch_model(request, feature_store) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +def test_fetch_model_disk_missing() -> None: + """Verify that the ML worker fails to retrieves a model + when given an invalid (file system) key""" + worker = MachineLearningWorkerCore + feature_store = MemoryFeatureStore() + + key = "/path/that/doesnt/exist" + + request = InferenceRequest(model_key=key) + + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_model(request, feature_store) + + # ensure the error message includes key-identifying information + assert key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker 
= MachineLearningWorkerCore + + # create a key to retrieve from the feature store + key = "test-model" + + # put model bytes into the feature store + feature_store = MemoryFeatureStore() + feature_store[key] = persist_torch_model.read_bytes() + + request = InferenceRequest(model_key=key) + fetch_result = worker.fetch_model(request, feature_store) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +def test_fetch_model_feature_store_missing() -> None: + """Verify that the ML worker fails to retrieves a model + when given an invalid (feature store) key""" + worker = MachineLearningWorkerCore + + bad_key = "some-key" + feature_store = MemoryFeatureStore() + + request = InferenceRequest(model_key=bad_key) + + # todo: consider that raising this exception shows impl. replace... + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_model(request, feature_store) + + # ensure the error message includes key-identifying information + assert bad_key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + + key = "test-model" + feature_store = MemoryFeatureStore() + feature_store[key] = persist_torch_model.read_bytes() + + request = InferenceRequest(model_key=key) + + fetch_result = worker.fetch_model(request, feature_store) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a tensor/input + when given a valid (file system) key""" + tensor_name = str(persist_torch_tensor) + + request = InferenceRequest(input_keys=[tensor_name]) + worker = MachineLearningWorkerCore + + feature_store = MemoryFeatureStore() + feature_store[tensor_name] = persist_torch_tensor.read_bytes() + + fetch_result = worker.fetch_inputs(request, feature_store) + assert fetch_result.inputs is not None + + +def test_fetch_input_disk_missing() -> None: + """Verify that the ML worker fails to retrieves a tensor/input + when given an invalid (file system) key""" + worker = MachineLearningWorkerCore + + key = "/path/that/doesnt/exist" + feature_store = MemoryFeatureStore() + + request = InferenceRequest(input_keys=[key]) + + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_inputs(request, feature_store) + + # ensure the error message includes key-identifying information + assert key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a tensor/input + when given a valid (feature store) key""" + worker = MachineLearningWorkerCore + + tensor_name = "test-tensor" + feature_store = MemoryFeatureStore() + + request = InferenceRequest(input_keys=[tensor_name]) + + # put model bytes into the feature store + feature_store[tensor_name] = persist_torch_tensor.read_bytes() + + fetch_result = worker.fetch_inputs(request, feature_store) + assert fetch_result.inputs + assert list(fetch_result.inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + + 
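The fetch tests above exercise the first stages of the worker pipeline. A condensed sketch of that flow (illustration only; it assumes the MemoryFeatureStore helper from tests/mli/featurestore.py is importable and uses placeholder byte strings in place of real model and tensor payloads):

```python
from smartsim._core.mli.infrastructure.worker.worker import (
    InferenceRequest,
    MachineLearningWorkerCore,
)

from .featurestore import MemoryFeatureStore  # test helper defined in this patch

store = MemoryFeatureStore()
store["my-model"] = b"serialized-model-bytes"
store["input-0"] = b"serialized-tensor-bytes"

request = InferenceRequest(model_key="my-model", input_keys=["input-0"])

# model and input payloads are resolved by key from the feature store
model_result = MachineLearningWorkerCore.fetch_model(request, store)
input_result = MachineLearningWorkerCore.fetch_inputs(request, store)

assert model_result.model_bytes == b"serialized-model-bytes"
assert input_result.inputs == [b"serialized-tensor-bytes"]
```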
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> None:
+    """Verify that the ML worker successfully retrieves multiple tensors/inputs
+    when given a valid collection of (feature store) keys"""
+    worker = MachineLearningWorkerCore
+
+    tensor_name = "test-tensor"
+    feature_store = MemoryFeatureStore()
+
+    # put tensor bytes into the feature store
+    body1 = persist_torch_tensor.read_bytes()
+    feature_store[tensor_name + "1"] = body1
+
+    body2 = b"abcdefghijklmnopqrstuvwxyz"
+    feature_store[tensor_name + "2"] = body2
+
+    body3 = b"mnopqrstuvwxyzabcdefghijkl"
+    feature_store[tensor_name + "3"] = body3
+
+    request = InferenceRequest(
+        input_keys=[tensor_name + "1", tensor_name + "2", tensor_name + "3"]
+    )
+
+    fetch_result = worker.fetch_inputs(request, feature_store)
+
+    raw_bytes = list(fetch_result.inputs)
+    assert raw_bytes
+    assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10]
+    assert raw_bytes[1][:10] == body2[:10]
+    assert raw_bytes[2][:10] == body3[:10]
+
+
+def test_fetch_input_feature_store_missing() -> None:
+    """Verify that the ML worker fails to retrieve a tensor/input
+    when given an invalid (feature store) key"""
+    worker = MachineLearningWorkerCore
+
+    bad_key = "some-key"
+    feature_store = MemoryFeatureStore()
+    request = InferenceRequest(input_keys=[bad_key])
+
+    with pytest.raises(sse.SmartSimError) as ex:
+        worker.fetch_inputs(request, feature_store)
+
+    # ensure the error message includes key-identifying information
+    assert bad_key in ex.value.args[0]
+
+
+@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed")
+def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None:
+    """Verify that the ML worker successfully retrieves a tensor/input
+    when given a valid (in-memory feature store) key"""
+    worker = MachineLearningWorkerCore
+    feature_store = MemoryFeatureStore()
+
+    model_name = "test-model"
+    feature_store[model_name] = persist_torch_tensor.read_bytes()
+    request = InferenceRequest(input_keys=[model_name])
+
+    fetch_result = worker.fetch_inputs(request, feature_store)
+    assert fetch_result.inputs is not None
+
+
+def test_batch_requests() -> None:
+    """Verify that batch_requests handles an empty data set gracefully"""
+    worker = MachineLearningWorkerCore
+    result = TransformInputResult([])
+
+    request = InferenceRequest(batch_size=10)
+
+    with pytest.raises(NotImplementedError):
+        # NOTE: we expect this to fail since it's not yet implemented.
+        # TODO: once implemented, replace this expectation of failure...
+ worker.batch_requests(request, result) + + +def test_place_outputs() -> None: + """Verify outputs are shared using the feature store""" + worker = MachineLearningWorkerCore + + key_name = "test-model" + feature_store = MemoryFeatureStore() + + # create a key to retrieve from the feature store + keys = [key_name + "1", key_name + "2", key_name + "3"] + data = [b"abcdef", b"ghijkl", b"mnopqr"] + + for k, v in zip(keys, data): + feature_store[k] = v + + request = InferenceRequest(output_keys=keys) + transform_result = TransformOutputResult(data, [1], "c", "float32") + + worker.place_output(request, transform_result, feature_store) + + for i in range(3): + assert feature_store[keys[i]] == data[i] diff --git a/tests/mli/test_default_torch_worker.py b/tests/mli/test_default_torch_worker.py new file mode 100644 index 0000000000..b2ec6c3dca --- /dev/null +++ b/tests/mli/test_default_torch_worker.py @@ -0,0 +1,206 @@ +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# import io +# import pathlib +# import typing as t + +# import pytest +# import torch + +# from smartsim._core.mli.infrastructure.worker.integratedtorchworker import ( +# IntegratedTorchWorker, +# ) +# import smartsim.error as sse +# from smartsim._core.mli.infrastructure import MemoryFeatureStore +# from smartsim._core.mli.infrastructure.worker.worker import ( +# ExecuteResult, +# FetchInputResult, +# FetchModelResult, +# InferenceRequest, +# TransformInputResult, +# LoadModelResult, +# ) +# from smartsim._core.utils import installed_redisai_backends + +# # The tests in this file belong to the group_a group +# pytestmark = pytest.mark.group_b + +# # retrieved from pytest fixtures +# is_dragon = pytest.test_launcher == "dragon" +# torch_available = "torch" in installed_redisai_backends() + + +# @pytest.fixture +# def persist_torch_model(test_dir: str) -> pathlib.Path: +# test_path = pathlib.Path(test_dir) +# model_path = test_path / "basic.pt" + +# model = torch.nn.Linear(2, 1) +# torch.save(model, model_path) + +# return model_path + + +# # def test_deserialize() -> None: +# # """Verify that serialized requests are properly deserialized to +# # and converted to the internal representation used by ML workers""" +# # worker = SampleTorchWorker +# # buffer = io.BytesIO() + +# # exp_model_key = "model-key" +# # msg = InferenceRequest(model_key=exp_model_key) +# # pickle.dump(msg, buffer) + +# # deserialized: InferenceRequest = worker.deserialize(buffer.getvalue()) + +# # assert deserialized.model_key == exp_model_key +# # # assert deserialized.backend == exp_backend + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_load_model_from_disk(persist_torch_model: pathlib.Path) -> None: +# """Verify that a model can be loaded using a FileSystemFeatureStore""" +# worker = IntegratedTorchWorker +# request = InferenceRequest(raw_model=persist_torch_model.read_bytes()) + +# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) +# load_result = worker.load_model(request, fetch_result) + +# input = torch.randn(2) +# pred = load_result.model(input) + +# assert pred + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_transform_input() -> None: +# """Verify that the default input transform operation is a no-op copy""" +# rows, cols = 1, 4 +# num_values = 7 +# tensors = [torch.randn((rows, cols)) for _ in range(num_values)] + +# request = InferenceRequest() + +# inputs: t.List[bytes] = [] +# for tensor in tensors: +# buffer = io.BytesIO() +# torch.save(tensor, buffer) +# inputs.append(buffer.getvalue()) + +# fetch_result = FetchInputResult(inputs) +# worker = IntegratedTorchWorker +# result = worker.transform_input(request, fetch_result) +# transformed: t.Collection[torch.Tensor] = result.transformed + +# assert len(transformed) == num_values + +# for output, expected in zip(transformed, tensors): +# assert output.shape == expected.shape +# assert output.equal(expected) + +# transformed = list(transformed) + +# original: torch.Tensor = tensors[0] +# assert transformed[0].equal(original) + +# # verify a copy was made +# transformed[0] = 2 * transformed[0] +# assert transformed[0].equal(2 * original) + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_execute_model(persist_torch_model: pathlib.Path) -> None: +# """Verify that a model executes corrrectly via the worker""" + +# # put model bytes into memory +# model_name = "test-key" +# feature_store = 
MemoryFeatureStore() +# feature_store[model_name] = persist_torch_model.read_bytes() + +# worker = IntegratedTorchWorker +# request = InferenceRequest(model_key=model_name) +# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) +# load_result = worker.load_model(request, fetch_result) + +# value = torch.randn(2) +# transform_result = TransformInputResult([value]) + +# execute_result = worker.execute(request, load_result, transform_result) + +# assert execute_result.predictions is not None + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_execute_missing_model(persist_torch_model: pathlib.Path) -> None: +# """Verify that a executing a model with an invalid key fails cleanly""" + +# # use key that references an un-set model value +# model_name = "test-key" +# feature_store = MemoryFeatureStore() +# feature_store[model_name] = persist_torch_model.read_bytes() + +# worker = IntegratedTorchWorker +# request = InferenceRequest(input_keys=[model_name]) + +# load_result = LoadModelResult(None) +# transform_result = TransformInputResult( +# [torch.randn(2), torch.randn(2), torch.randn(2)] +# ) + +# with pytest.raises(sse.SmartSimError) as ex: +# worker.execute(request, load_result, transform_result) + +# assert "Model must be loaded" in ex.value.args[0] + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_transform_output() -> None: +# """Verify that the default output transform operation is a no-op copy""" +# rows, cols = 1, 4 +# num_values = 7 +# inputs = [torch.randn((rows, cols)) for _ in range(num_values)] +# exp_outputs = [torch.Tensor(tensor) for tensor in inputs] + +# worker = SampleTorchWorker +# request = InferenceRequest() +# exec_result = ExecuteResult(inputs) + +# result = worker.transform_output(request, exec_result) + +# assert len(result.outputs) == num_values + +# for output, expected in zip(result.outputs, exp_outputs): +# assert output.shape == expected.shape +# assert output.equal(expected) + +# transformed = list(result.outputs) + +# # verify a copy was made +# original: torch.Tensor = inputs[0] +# transformed[0] = 2 * transformed[0] + +# assert transformed[0].equal(2 * original) diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py new file mode 100644 index 0000000000..3731aabf11 --- /dev/null +++ b/tests/mli/test_integrated_torch_worker.py @@ -0,0 +1,290 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +import pytest +import torch + +# import smartsim.error as sse +# from smartsim._core.mli.infrastructure.control import workermanager as mli +# from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils import installed_redisai_backends + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + +# retrieved from pytest fixtures +is_dragon = pytest.test_launcher == "dragon" +torch_available = "torch" in installed_redisai_backends() + + +@pytest.fixture +def persist_torch_model(test_dir: str) -> pathlib.Path: + test_path = pathlib.Path(test_dir) + model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +# todo: move deserialization tests into suite for worker manager where serialization occurs + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_direct_request(persist_torch_model: pathlib.Path) -> None: +# """Verify that a direct requestis deserialized properly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_bytes = persist_torch_model.read_bytes() +# input_tensor = torch.randn(2) + +# expected_device = "cpu" +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# device=expected_device, +# inputs=[message_tensor_input], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.device == expected_device +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_indirect_request(persist_torch_model: pathlib.Path) -> None: +# """Verify that an indirect request is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_key = "persisted-model" +# # model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# # input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_device = "cpu" +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) + +# request = 
MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# device=expected_device, +# inputs=[message_tensor_input_key], +# outputs=[message_tensor_output_key], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.device == expected_device +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_inputs( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect inputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# # model_key = "persisted-model" +# model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# # input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_device = "cpu" +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# device=expected_device, +# inputs=[message_tensor_input_key], +# # outputs=[message_tensor_output_key], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.device == expected_device +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_outputs( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect outputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# # model_key = "persisted-model" +# model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_device = "cpu" +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# device=expected_device, +# inputs=[message_tensor_input], +# # outputs=[message_tensor_output_key], +# outputs=[message_tensor_output_key], +# custom_attributes=None, +# ) + +# 
msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.device == expected_device +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_model( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect outputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_key = "persisted-model" +# # model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# # input_key = f"demo-input" +# input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_device = "cpu" +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# device=expected_device, +# inputs=[message_tensor_input], +# # outputs=[message_tensor_output_key], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.device == expected_device +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_serialize(test_dir: str, persist_torch_model: pathlib.Path) -> None: +# """Verify that the worker correctly executes reply serialization""" +# worker = mli.IntegratedTorchWorker + +# reply = mli.InferenceReply() +# reply.output_keys = ["foo", "bar"] + +# # use the worker implementation of reply serialization to get bytes for +# # use on the callback channel +# reply_bytes = worker.serialize_reply(reply) +# assert reply_bytes is not None + +# # deserialize to verity the mapping in the worker.serialize_reply was correct +# actual_reply = MessageHandler.deserialize_response(reply_bytes) + +# actual_tensor_keys = [tk.key for tk in actual_reply.result.keys] +# assert set(actual_tensor_keys) == set(reply.output_keys) +# assert actual_reply.status == 200 +# assert actual_reply.statusMessage == "success" diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py new file mode 100644 index 0000000000..617738f949 --- /dev/null +++ b/tests/mli/test_service.py @@ -0,0 +1,205 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import multiprocessing as mp +import pathlib +import typing as t +from asyncore import loop + +import pytest +import torch + +import smartsim.error as sse +from smartsim._core.entrypoints.service import Service + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_a + + +class SimpleService(Service): + """Mock implementation of a service that counts method invocations + using the base class event hooks.""" + + def __init__( + self, + log: t.List[str], + quit_after: int = 0, + as_service: bool = False, + cooldown: int = 0, + loop_delay: int = 0, + ) -> None: + super().__init__(as_service, cooldown, loop_delay) + self._log = log + self._quit_after = quit_after + self.num_iterations = 0 + self.num_starts = 0 + self.num_shutdowns = 0 + self.num_cooldowns = 0 + self.num_can_shutdown = 0 + self.num_delays = 0 + + def _on_iteration(self) -> None: + self.num_iterations += 1 + + def _on_start(self) -> None: + self.num_starts += 1 + + def _on_shutdown(self) -> None: + self.num_shutdowns += 1 + + def _on_cooldown_elapsed(self) -> None: + self.num_cooldowns += 1 + + def _on_delay(self) -> None: + self.num_delays += 1 + + def _can_shutdown(self) -> bool: + self.num_can_shutdown += 1 + if self._quit_after == 0: + return True + + return self.num_iterations >= self._quit_after + + +def test_service_init() -> None: + """Verify expected default values after Service initialization""" + activity_log: t.List[str] = [] + service = SimpleService(activity_log) + + assert service._as_service is False + assert service._cooldown == 0 + assert service._loop_delay == 0 + + +def test_service_run_once() -> None: + """Verify the service completes after a single call to _on_iteration""" + activity_log: t.List[str] = [] + service = SimpleService(activity_log) + + service.execute() + + assert service.num_iterations == 1 + assert service.num_starts == 1 + assert service.num_cooldowns == 0 # it never exceeds a cooldown period + assert service.num_can_shutdown == 0 # it automatically exits in run once + assert service.num_shutdowns == 1 + + +@pytest.mark.parametrize( + "num_iterations", + [ + pytest.param(0, id="Immediate Shutdown"), + pytest.param(1, id="1x"), + pytest.param(2, id="2x"), + pytest.param(4, id="4x"), + pytest.param(8, id="8x"), + pytest.param(16, id="16x"), + pytest.param(32, id="32x"), + ], +) +def test_service_run_until_can_shutdown(num_iterations: int) -> None: + """Verify the service completes after a dynamic number of iterations + based on the return value of 
`_can_shutdown`""" + activity_log: t.List[str] = [] + + service = SimpleService(activity_log, quit_after=num_iterations, as_service=True) + + service.execute() + + if num_iterations == 0: + # no matter what, it should always execute the _on_iteration method + assert service.num_iterations == 1 + else: + assert service.num_iterations == num_iterations + + assert service.num_starts == 1 + assert service.num_shutdowns == 1 + + +@pytest.mark.parametrize( + "cooldown", + [ + pytest.param(1, id="1s"), + pytest.param(3, id="3s"), + pytest.param(5, id="5s"), + ], +) +def test_service_cooldown(cooldown: int) -> None: + """Verify that the cooldown period is respected""" + activity_log: t.List[str] = [] + + service = SimpleService( + activity_log, + quit_after=1, + as_service=True, + cooldown=cooldown, + loop_delay=0, + ) + + ts0 = datetime.datetime.now() + service.execute() + ts1 = datetime.datetime.now() + + fudge_factor = 1.1 # allow a little bit of wiggle room for the loop + duration_in_seconds = (ts1 - ts0).total_seconds() + + assert duration_in_seconds <= cooldown * fudge_factor + assert service.num_cooldowns == 1 + assert service.num_shutdowns == 1 + + +@pytest.mark.parametrize( + "delay, num_iterations", + [ + pytest.param(1, 3, id="1s delay, 3x"), + pytest.param(3, 2, id="2s delay, 2x"), + pytest.param(5, 1, id="5s delay, 1x"), + ], +) +def test_service_delay(delay: int, num_iterations: int) -> None: + """Verify that a delay is correctly added between iterations""" + activity_log: t.List[str] = [] + + service = SimpleService( + activity_log, + quit_after=num_iterations, + as_service=True, + cooldown=0, + loop_delay=delay, + ) + + ts0 = datetime.datetime.now() + service.execute() + ts1 = datetime.datetime.now() + + # the expected duration is the sum of the delay between each iteration + expected_duration = (num_iterations + 1) * delay + duration_in_seconds = (ts1 - ts0).total_seconds() + + assert duration_in_seconds <= expected_duration + assert service.num_cooldowns == 0 + assert service.num_shutdowns == 1 diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py new file mode 100644 index 0000000000..3d88ee4088 --- /dev/null +++ b/tests/mli/test_worker_manager.py @@ -0,0 +1,196 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import multiprocessing as mp +import pathlib +import time +import typing as t + +import pytest +import torch + +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .channel import FileSystemCommChannel +from .featurestore import FileSystemFeatureStore +from .worker import IntegratedTorchWorker + +logger = get_logger(__name__) +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_a + + +def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: + """Mock event producer for triggering the inference pipeline""" + # todo: move to unit tests + while True: + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + timestamp = time.time_ns() + output_dir = "/lus/bnchlu1/mcbridch/code/ss/_tmp" + output_path = pathlib.Path(output_dir) + + mock_channel = output_path / f"brainstorm-{timestamp}.txt" + mock_model = output_path / "brainstorm.pt" + + output_path.mkdir(parents=True, exist_ok=True) + mock_channel.touch() + mock_model.touch() + + msg = f"PyTorch:{mock_model}:MockInputToReplace:{mock_channel}" + worker_manager_queue.put(msg.encode("utf-8")) + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. + + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + worker_manager_queue: "mp.Queue[bytes]", + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + iteration_number = 0 + + while True: + iteration_number += 1 + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. 
for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + expected_device: t.Literal["cpu", "gpu"] = "cpu" + channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" + callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + buffer = io.BytesIO() + tensor = torch.randn((1, 2), dtype=torch.float32) + torch.save(tensor, buffer) + feature_store[input_key] = buffer.getvalue() + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key) + message_model_key = MessageHandler.build_model_key(model_key) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + device=expected_device, + inputs=[message_tensor_input_key], + outputs=[message_tensor_output_key], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + worker_manager_queue.put(request_bytes) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def test_worker_manager(prepare_environment: pathlib.Path) -> None: + """Test the worker manager""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + work_queue: "mp.Queue[bytes]" = mp.Queue() + integrated_worker = IntegratedTorchWorker() + file_system_store = FileSystemFeatureStore() + + worker_manager = WorkerManager( + work_queue, + integrated_worker, + file_system_store, + as_service=True, + cooldown=10, + comm_channel_type=FileSystemCommChannel, + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=(work_queue, file_system_store, fs_path, comm_path), + ) + msg_pump.start() + + # # create a process to process commands + process = mp.Process(target=worker_manager.execute) + process.start() + process.join(timeout=5) + process.kill() + msg_pump.kill() diff --git a/tests/mli/worker.py b/tests/mli/worker.py new file mode 100644 index 0000000000..b1de280185 --- /dev/null +++ b/tests/mli/worker.py @@ -0,0 +1,128 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import typing as t + +import torch + +import smartsim._core.mli.infrastructure.worker.worker as mliw +import smartsim.error as sse +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): + """A minimum implementation of a worker that executes a PyTorch model""" + + # @staticmethod + # def deserialize(request: InferenceRequest) -> t.List[t.Any]: + # # request.input_meta + # # request.raw_inputs + # return request + + @staticmethod + def load_model( + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + ) -> mliw.LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + model: torch.nn.Module = torch.load(io.BytesIO(model_bytes)) + result = mliw.LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: mliw.InferenceRequest, + fetch_result: mliw.FetchInputResult, + ) -> mliw.TransformInputResult: + # extra metadata for assembly can be found in request.input_meta + raw_inputs = request.raw_inputs or fetch_result.inputs + + result: t.List[torch.Tensor] = [] + # should this happen here? + # consider - fortran to c data layout + # is there an intermediate representation before really doing torch.load? + if raw_inputs: + result = [torch.load(io.BytesIO(item)) for item in raw_inputs] + + return mliw.TransformInputResult(result) + + @staticmethod + def execute( + request: mliw.InferenceRequest, + load_result: mliw.LoadModelResult, + transform_result: mliw.TransformInputResult, + ) -> mliw.ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model = load_result.model + results = [model(tensor) for tensor in transform_result.transformed] + + execute_result = mliw.ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: mliw.InferenceRequest, + execute_result: mliw.ExecuteResult, + ) -> mliw.TransformOutputResult: + # transformed = [item.clone() for item in execute_result.predictions] + # return OutputTransformResult(transformed) + + # transformed = [item.bytes() for item in execute_result.predictions] + + # OutputTransformResult.transformed SHOULD be a list of + # capnproto Tensors Or tensor descriptors accompanying bytes + + # send the original tensors... 
+ execute_result.predictions = [t.detach() for t in execute_result.predictions] + # todo: solve sending all tensor metadata that coincisdes with each prediction + return mliw.TransformOutputResult( + execute_result.predictions, [1], "c", "float32" + ) + # return OutputTransformResult(transformed) + + # @staticmethod + # def serialize_reply( + # request: InferenceRequest, results: OutputTransformResult + # ) -> t.Any: + # # results = IntegratedTorchWorker._prepare_outputs(results.outputs) + # # return results + # return None + # # response = MessageHandler.build_response( + # # status=200, # todo: are we satisfied with 0/1 (success, fail) + # # # todo: if not detailed messages, this shouldn't be returned. + # # message="success", + # # result=results, + # # custom_attributes=None, + # # ) + # # serialized_resp = MessageHandler.serialize_response(response) + # # return serialized_resp From ab900b87c4b29683df851bf42be609fd4491b07e Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:34:33 -0700 Subject: [PATCH 03/60] Remove device attribute from schemas (#619) This PR removes `device` from the schemas, MessageHandler, and tests. --- doc/changelog.md | 1 + .../infrastructure/control/workermanager.py | 2 - .../_core/mli/infrastructure/worker/worker.py | 2 - smartsim/_core/mli/message_handler.py | 19 --- .../mli/mli_schemas/request/request.capnp | 21 +-- .../mli/mli_schemas/request/request_capnp.pyi | 3 - tests/mli/test_integrated_torch_worker.py | 15 -- tests/mli/test_worker_manager.py | 2 - tests/test_message_handler/test_request.py | 134 +++--------------- 9 files changed, 24 insertions(+), 175 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index e38d234965..e86c93de66 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -16,6 +16,7 @@ Description - Add ML worker manager, sample worker, and feature store - Added schemas and MessageHandler class for de/serialization of inference requests and response messages +- Removed device from schemas, MessageHandler and tests ### Development branch diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b3b79f7f30..b113f9187e 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -64,7 +64,6 @@ def deserialize_message( request = MessageHandler.deserialize_request(data_blob) # return request - device = request.device model_key: t.Optional[str] = None model_bytes: t.Optional[bytes] = None @@ -106,7 +105,6 @@ def deserialize_message( input_keys=input_keys, raw_model=model_bytes, batch_size=0, - device=device, ) return inference_request diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 99b51e178d..c87722b290 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -50,7 +50,6 @@ def __init__( output_keys: t.Optional[t.List[str]] = None, raw_model: t.Optional[bytes] = None, batch_size: int = 0, - device: t.Optional[str] = None, ): """Initialize the object""" self.model_key = model_key @@ -61,7 +60,6 @@ def __init__( self.input_meta = input_meta or [] self.output_keys = output_keys or [] self.batch_size = batch_size - self.device = device class InferenceReply: diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 733fa83d98..b17f359c33 
100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -220,22 +220,6 @@ def _assign_reply_channel( except Exception as e: raise ValueError("Error building reply channel portion of request.") from e - @staticmethod - def _assign_device( - request: request_capnp.Request, device: "request_capnp.Device" - ) -> None: - """ - Assigns a device to the supplied request. - - :param request: Request being built - :param device: Device to be assigned - :raises ValueError: if building fails - """ - try: - request.device = device - except Exception as e: - raise ValueError("Error building device portion of request.") from e - @staticmethod def _assign_inputs( request: request_capnp.Request, @@ -342,7 +326,6 @@ def _assign_custom_request_attributes( def build_request( reply_channel: t.ByteString, model: t.Union[data_references_capnp.ModelKey, t.ByteString], - device: "request_capnp.Device", inputs: t.Union[ t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] ], @@ -359,7 +342,6 @@ def build_request( :param reply_channel: Reply channel to be assigned to request :param model: Model to be assigned to request - :param device: Device to be assigned to request :param inputs: Inputs to be assigned to request :param outputs: Outputs to be assigned to request :param output_descriptors: Output descriptors to be assigned to request @@ -368,7 +350,6 @@ def build_request( request = request_capnp.Request.new_message() MessageHandler._assign_reply_channel(request, reply_channel) MessageHandler._assign_model(request, model) - MessageHandler._assign_device(request, device) MessageHandler._assign_inputs(request, inputs) MessageHandler._assign_outputs(request, outputs) MessageHandler._assign_output_descriptors(request, output_descriptors) diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 446c628a4c..9387090444 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -30,12 +30,6 @@ using Tensors = import "../tensor/tensor.capnp"; using RequestAttributes = import "request_attributes/request_attributes.capnp"; using DataRef = import "../data/data_references.capnp"; -enum Device { - cpu @0; - gpu @1; - auto @2; -} - struct ChannelDescriptor { reply @0 :Data; } @@ -46,16 +40,15 @@ struct Request { modelKey @1 :DataRef.ModelKey; modelData @2 :Data; } - device @3 :Device; input :union { - inputKeys @4 :List(DataRef.TensorKey); - inputData @5 :List(Tensors.Tensor); + inputKeys @3 :List(DataRef.TensorKey); + inputData @4 :List(Tensors.Tensor); } - output @6 :List(DataRef.TensorKey); - outputDescriptors @7 :List(Tensors.OutputDescriptor); + output @5 :List(DataRef.TensorKey); + outputDescriptors @6 :List(Tensors.OutputDescriptor); customAttributes :union { - torch @8 :RequestAttributes.TorchRequestAttributes; - tf @9 :RequestAttributes.TensorFlowRequestAttributes; - none @10 :Void; + torch @7 :RequestAttributes.TorchRequestAttributes; + tf @8 :RequestAttributes.TensorFlowRequestAttributes; + none @9 :Void; } } \ No newline at end of file diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index 5d622d4e6d..2051551550 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -33,8 +33,6 @@ from .request_attributes.request_attributes_capnp import ( 
TorchRequestAttributesReader, ) -Device = Literal["cpu", "gpu", "auto"] - class ChannelDescriptor: reply: bytes @staticmethod @@ -215,7 +213,6 @@ class Request: def write_packed(file: BufferedWriter) -> None: ... replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader model: Request.Model | Request.ModelBuilder | Request.ModelReader - device: Device input: Request.Input | Request.InputBuilder | Request.InputReader output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] outputDescriptors: Sequence[ diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py index 3731aabf11..60f1f0c6b9 100644 --- a/tests/mli/test_integrated_torch_worker.py +++ b/tests/mli/test_integrated_torch_worker.py @@ -66,7 +66,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # model_bytes = persist_torch_model.read_bytes() # input_tensor = torch.randn(2) -# expected_device = "cpu" # expected_callback_channel = b"faux_channel_descriptor_bytes" # callback_channel = mli.DragonCommChannel.find(expected_callback_channel) @@ -77,7 +76,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, # model=model_bytes, -# device=expected_device, # inputs=[message_tensor_input], # outputs=[], # custom_attributes=None, @@ -86,7 +84,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # msg_bytes = MessageHandler.serialize_request(request) # inference_request = worker.deserialize(msg_bytes) -# assert inference_request.device == expected_device # assert inference_request.callback._descriptor == expected_callback_channel @@ -104,7 +101,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # # input_tensor = torch.randn(2) # # feature_store[input_key] = input_tensor -# expected_device = "cpu" # expected_callback_channel = b"faux_channel_descriptor_bytes" # callback_channel = mli.DragonCommChannel.find(expected_callback_channel) @@ -117,7 +113,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, # model=message_model_key, -# device=expected_device, # inputs=[message_tensor_input_key], # outputs=[message_tensor_output_key], # custom_attributes=None, @@ -126,7 +121,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # msg_bytes = MessageHandler.serialize_request(request) # inference_request = worker.deserialize(msg_bytes) -# assert inference_request.device == expected_device # assert inference_request.callback._descriptor == expected_callback_channel @@ -147,7 +141,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # # input_tensor = torch.randn(2) # # feature_store[input_key] = input_tensor -# expected_device = "cpu" # expected_callback_channel = b"faux_channel_descriptor_bytes" # callback_channel = mli.DragonCommChannel.find(expected_callback_channel) @@ -160,7 +153,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, # model=model_bytes, -# device=expected_device, # inputs=[message_tensor_input_key], # # outputs=[message_tensor_output_key], # outputs=[], @@ -170,7 +162,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # msg_bytes = MessageHandler.serialize_request(request) # inference_request = worker.deserialize(msg_bytes) -# assert inference_request.device == expected_device # assert inference_request.callback._descriptor == 
expected_callback_channel @@ -191,7 +182,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # input_tensor = torch.randn(2) # # feature_store[input_key] = input_tensor -# expected_device = "cpu" # expected_callback_channel = b"faux_channel_descriptor_bytes" # callback_channel = mli.DragonCommChannel.find(expected_callback_channel) @@ -207,7 +197,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, # model=model_bytes, -# device=expected_device, # inputs=[message_tensor_input], # # outputs=[message_tensor_output_key], # outputs=[message_tensor_output_key], @@ -217,7 +206,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # msg_bytes = MessageHandler.serialize_request(request) # inference_request = worker.deserialize(msg_bytes) -# assert inference_request.device == expected_device # assert inference_request.callback._descriptor == expected_callback_channel @@ -238,7 +226,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # input_tensor = torch.randn(2) # # feature_store[input_key] = input_tensor -# expected_device = "cpu" # expected_callback_channel = b"faux_channel_descriptor_bytes" # callback_channel = mli.DragonCommChannel.find(expected_callback_channel) @@ -254,7 +241,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, # model=message_model_key, -# device=expected_device, # inputs=[message_tensor_input], # # outputs=[message_tensor_output_key], # outputs=[], @@ -264,7 +250,6 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # msg_bytes = MessageHandler.serialize_request(request) # inference_request = worker.deserialize(msg_bytes) -# assert inference_request.device == expected_device # assert inference_request.callback._descriptor == expected_callback_channel diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 3d88ee4088..01502ec521 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -122,7 +122,6 @@ def mock_messages( # working set size > 1 has side-effects # only incurs cost when working set size has been exceeded - expected_device: t.Literal["cpu", "gpu"] = "cpu" channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) @@ -144,7 +143,6 @@ def mock_messages( request = MessageHandler.build_request( reply_channel=callback_channel.descriptor, model=message_model_key, - device=expected_device, inputs=[message_tensor_input_key], outputs=[message_tensor_output_key], custom_attributes=None, diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index d33a0376a8..c527a0086f 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -92,7 +92,6 @@ tf_indirect_request = MessageHandler.build_request( b"reply", b"model", - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1, output_descriptor2, output_descriptor3], @@ -102,7 +101,6 @@ tf_direct_request = MessageHandler.build_request( b"reply", b"model", - "cpu", [tensor_3, tensor_4], [], [output_descriptor1, output_descriptor2], @@ -113,7 +111,6 @@ torch_indirect_request = MessageHandler.build_request( b"reply", b"model", - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1, output_descriptor2, output_descriptor3], @@ 
-122,7 +119,6 @@ torch_direct_request = MessageHandler.build_request( b"reply", b"model", - "cpu", [tensor_1, tensor_2], [], [output_descriptor1, output_descriptor2], @@ -132,12 +128,11 @@ @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( b"reply channel", model_key, - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1], @@ -146,7 +141,6 @@ pytest.param( b"another reply channel", b"model data", - "gpu", [input_key1], [output_key2], [output_descriptor1], @@ -155,7 +149,6 @@ pytest.param( b"another reply channel", b"model data", - "auto", [input_key1], [output_key2], [output_descriptor1], @@ -164,7 +157,6 @@ pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1], [output_descriptor1], @@ -173,12 +165,11 @@ ], ) def test_build_request_indirect_tf_successful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -190,7 +181,6 @@ def test_build_request_indirect_tf_successful( assert built_request.model.modelKey.key == model.key else: assert built_request.model.modelData == model - assert built_request.device == device assert built_request.input.which() == "inputKeys" assert built_request.input.inputKeys[0].key == input[0].key assert len(built_request.input.inputKeys) == len(input) @@ -212,12 +202,11 @@ def test_build_request_indirect_tf_successful( @pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( b"reply channel", model_key, - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1], @@ -226,7 +215,6 @@ def test_build_request_indirect_tf_successful( pytest.param( b"another reply channel", b"model data", - "gpu", [input_key1], [output_key2], [output_descriptor1], @@ -235,7 +223,6 @@ def test_build_request_indirect_tf_successful( pytest.param( b"another reply channel", b"model data", - "auto", [input_key1], [output_key2], [output_descriptor1], @@ -244,7 +231,6 @@ def test_build_request_indirect_tf_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1], [output_descriptor1], @@ -253,12 +239,11 @@ def test_build_request_indirect_tf_successful( ], ) def test_build_request_indirect_torch_successful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -270,7 +255,6 @@ def test_build_request_indirect_torch_successful( assert built_request.model.modelKey.key == model.key else: assert built_request.model.modelData == model - assert built_request.device == device assert built_request.input.which() == "inputKeys" assert built_request.input.inputKeys[0].key == input[0].key assert len(built_request.input.inputKeys) == len(input) @@ -292,12 +276,11 @@ def 
test_build_request_indirect_torch_successful( @pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( [], model_key, - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1], @@ -307,7 +290,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", "bad model", - "gpu", [input_key1], [output_key2], [output_descriptor1], @@ -317,17 +299,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "bad device", - [input_key1], - [output_key2], - [output_descriptor1], - torch_attributes, - id="bad device", - ), - pytest.param( - b"reply channel", - model_key, - "cpu", ["input_key1", "input_key2"], [output_key1, output_key2], [output_descriptor1], @@ -337,7 +308,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [model_key], [output_key1, output_key2], [output_descriptor1], @@ -347,7 +317,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], ["output_key1", "output_key2"], [output_descriptor1], @@ -357,7 +326,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [model_key], [output_descriptor1], @@ -367,7 +335,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], [output_descriptor1], @@ -377,7 +344,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], [output_descriptor1], @@ -387,7 +353,6 @@ def test_build_request_indirect_torch_successful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], "bad descriptors", @@ -397,13 +362,12 @@ def test_build_request_indirect_torch_successful( ], ) def test_build_request_indirect_torch_unsuccessful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -413,12 +377,11 @@ def test_build_request_indirect_torch_unsuccessful( @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( [], model_key, - "cpu", [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1], @@ -428,7 +391,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", "bad model", - "gpu", [input_key1], [output_key2], [output_descriptor1], @@ -438,17 +400,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "bad device", - [input_key1], - [output_key2], - [output_descriptor1], - tf_attributes, - id="bad device", - ), - pytest.param( - b"reply channel", - model_key, - "cpu", ["input_key1", "input_key2"], [output_key1, output_key2], [output_descriptor1], @@ -458,7 +409,6 @@ def 
test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [model_key], [output_key1, output_key2], [output_descriptor1], @@ -468,7 +418,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], ["output_key1", "output_key2"], [output_descriptor1], @@ -478,7 +427,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [model_key], [output_descriptor1], @@ -488,7 +436,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], [output_descriptor1], @@ -498,7 +445,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], [output_descriptor1], @@ -508,7 +454,6 @@ def test_build_request_indirect_torch_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [input_key1], [output_key1, output_key2], "bad descriptors", @@ -518,13 +463,12 @@ def test_build_request_indirect_torch_unsuccessful( ], ) def test_build_request_indirect_tf_unsuccessful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -534,12 +478,11 @@ def test_build_request_indirect_tf_unsuccessful( @pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( b"reply channel", model_key, - "cpu", [tensor_1, tensor_2], [], [output_descriptor2], @@ -548,7 +491,6 @@ def test_build_request_indirect_tf_unsuccessful( pytest.param( b"another reply channel", b"model data", - "gpu", [tensor_1], [], [output_descriptor3], @@ -557,7 +499,6 @@ def test_build_request_indirect_tf_unsuccessful( pytest.param( b"another reply channel", b"model data", - "auto", [tensor_2], [], [output_descriptor1], @@ -566,7 +507,6 @@ def test_build_request_indirect_tf_unsuccessful( pytest.param( b"another reply channel", b"model data", - "auto", [tensor_1], [], [output_descriptor1], @@ -575,12 +515,11 @@ def test_build_request_indirect_tf_unsuccessful( ], ) def test_build_request_direct_torch_successful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -592,7 +531,6 @@ def test_build_request_direct_torch_successful( assert built_request.model.modelKey.key == model.key else: assert built_request.model.modelData == model - assert built_request.device == device assert built_request.input.which() == "inputData" assert built_request.input.inputData[0].blob == input[0].blob assert len(built_request.input.inputData) == len(input) @@ -614,12 +552,11 @@ def test_build_request_direct_torch_successful( @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, 
output, output_descriptors, custom_attributes", [ pytest.param( b"reply channel", model_key, - "cpu", [tensor_3, tensor_4], [], [output_descriptor2], @@ -628,7 +565,6 @@ def test_build_request_direct_torch_successful( pytest.param( b"another reply channel", b"model data", - "gpu", [tensor_4], [], [output_descriptor3], @@ -637,7 +573,6 @@ def test_build_request_direct_torch_successful( pytest.param( b"another reply channel", b"model data", - "auto", [tensor_4], [], [output_descriptor1], @@ -646,7 +581,6 @@ def test_build_request_direct_torch_successful( pytest.param( b"another reply channel", b"model data", - "auto", [tensor_3], [], [output_descriptor1], @@ -655,12 +589,11 @@ def test_build_request_direct_torch_successful( ], ) def test_build_request_direct_tf_successful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -672,7 +605,6 @@ def test_build_request_direct_tf_successful( assert built_request.model.modelKey.key == model.key else: assert built_request.model.modelData == model - assert built_request.device == device assert built_request.input.which() == "inputData" assert built_request.input.inputData[0].blob == input[0].blob assert len(built_request.input.inputData) == len(input) @@ -694,12 +626,11 @@ def test_build_request_direct_tf_successful( @pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( [], model_key, - "cpu", [tensor_1, tensor_2], [], [output_descriptor2], @@ -709,7 +640,6 @@ def test_build_request_direct_tf_successful( pytest.param( b"reply channel", "bad model", - "gpu", [tensor_1], [], [output_descriptor2], @@ -719,17 +649,6 @@ def test_build_request_direct_tf_successful( pytest.param( b"reply channel", model_key, - "bad device", - [tensor_2], - [], - [output_descriptor2], - torch_attributes, - id="bad device", - ), - pytest.param( - b"reply channel", - model_key, - "cpu", ["input_key1", "input_key2"], [], [output_descriptor2], @@ -739,7 +658,6 @@ def test_build_request_direct_tf_successful( pytest.param( b"reply channel", model_key, - "cpu", [], ["output_key1", "output_key2"], [output_descriptor2], @@ -749,7 +667,6 @@ def test_build_request_direct_tf_successful( pytest.param( b"reply channel", model_key, - "cpu", [tensor_1], [], [output_descriptor2], @@ -759,7 +676,6 @@ def test_build_request_direct_tf_successful( pytest.param( b"reply_channel", model_key, - "cpu", [tensor_1, tensor_2], [], ["output_descriptor2"], @@ -769,13 +685,12 @@ def test_build_request_direct_tf_successful( ], ) def test_build_torch_request_direct_unsuccessful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, @@ -785,12 +700,11 @@ def test_build_torch_request_direct_unsuccessful( @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( - "reply_channel, model, device, input, output, output_descriptors, custom_attributes", + "reply_channel, 
model, input, output, output_descriptors, custom_attributes", [ pytest.param( [], model_key, - "cpu", [tensor_3, tensor_4], [], [output_descriptor2], @@ -800,7 +714,6 @@ def test_build_torch_request_direct_unsuccessful( pytest.param( b"reply channel", "bad model", - "gpu", [tensor_4], [], [output_descriptor2], @@ -810,17 +723,6 @@ def test_build_torch_request_direct_unsuccessful( pytest.param( b"reply channel", model_key, - "bad device", - [tensor_3], - [], - [output_descriptor2], - tf_attributes, - id="bad device", - ), - pytest.param( - b"reply channel", - model_key, - "cpu", ["input_key1", "input_key2"], [], [output_descriptor2], @@ -830,7 +732,6 @@ def test_build_torch_request_direct_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [], ["output_key1", "output_key2"], [output_descriptor2], @@ -840,7 +741,6 @@ def test_build_torch_request_direct_unsuccessful( pytest.param( b"reply channel", model_key, - "cpu", [tensor_4], [], [output_descriptor2], @@ -850,7 +750,6 @@ def test_build_torch_request_direct_unsuccessful( pytest.param( b"reply_channel", model_key, - "cpu", [tensor_3, tensor_4], [], ["output_descriptor2"], @@ -860,13 +759,12 @@ def test_build_torch_request_direct_unsuccessful( ], ) def test_build_tf_request_direct_unsuccessful( - reply_channel, model, device, input, output, output_descriptors, custom_attributes + reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): built_request = MessageHandler.build_request( reply_channel, model, - device, input, output, output_descriptors, From 8a2f1733ba2654d7c77539c93a24f30d8a8b10ea Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Wed, 3 Jul 2024 10:33:20 -0700 Subject: [PATCH 04/60] Add model metadata to request schema (#624) Add `Model` schema with model metadata. 
[ committed by @AlyssaCote ] [ approved by @ankona ] --- doc/changelog.md | 3 +- .../infrastructure/control/workermanager.py | 21 ++--- .../_core/mli/infrastructure/worker/worker.py | 5 +- smartsim/_core/mli/message_handler.py | 50 ++++++++--- .../mli_schemas/data/data_references.capnp | 2 +- .../mli_schemas/data/data_references_capnp.py | 26 ++++++ .../data/data_references_capnp.pyi | 26 ++++++ .../_core/mli/mli_schemas/model/model.capnp | 33 +++++++ .../mli/mli_schemas/model/model_capnp.py | 38 +++++++++ .../mli/mli_schemas/model/model_capnp.pyi | 72 ++++++++++++++++ .../mli/mli_schemas/request/request.capnp | 11 +-- .../request_attributes.capnp | 2 +- .../request_attributes_capnp.py | 26 ++++++ .../request_attributes_capnp.pyi | 26 ++++++ .../mli/mli_schemas/request/request_capnp.py | 26 ++++++ .../mli/mli_schemas/request/request_capnp.pyi | 58 ++++++++++--- .../mli/mli_schemas/response/response.capnp | 2 +- .../response_attributes.capnp | 2 +- .../response_attributes_capnp.py | 26 ++++++ .../response_attributes_capnp.pyi | 26 ++++++ .../mli_schemas/response/response_capnp.py | 26 ++++++ .../mli_schemas/response/response_capnp.pyi | 26 ++++++ .../_core/mli/mli_schemas/tensor/tensor.capnp | 2 +- .../mli/mli_schemas/tensor/tensor_capnp.py | 26 ++++++ .../mli/mli_schemas/tensor/tensor_capnp.pyi | 26 ++++++ .../test_message_handler/test_build_model.py | 72 ++++++++++++++++ tests/test_message_handler/test_request.py | 85 ++++++++++--------- 27 files changed, 657 insertions(+), 87 deletions(-) create mode 100644 smartsim/_core/mli/mli_schemas/model/model.capnp create mode 100644 smartsim/_core/mli/mli_schemas/model/model_capnp.py create mode 100644 smartsim/_core/mli/mli_schemas/model/model_capnp.pyi create mode 100644 tests/test_message_handler/test_build_model.py diff --git a/doc/changelog.md b/doc/changelog.md index 9e6fb33e17..9f85c90959 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,10 +13,11 @@ Jump to: Description +- Add Model schema with model metadata included +- Removed device from schemas, MessageHandler and tests - Add ML worker manager, sample worker, and feature store - Added schemas and MessageHandler class for de/serialization of inference requests and response messages -- Removed device from schemas, MessageHandler and tests ### Development branch diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b113f9187e..43ff6e6799 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -43,6 +43,7 @@ from smartsim.log import get_logger if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum logger = get_logger(__name__) @@ -65,12 +66,12 @@ def deserialize_message( request = MessageHandler.deserialize_request(data_blob) # return request model_key: t.Optional[str] = None - model_bytes: t.Optional[bytes] = None + model_bytes: t.Optional[Model] = None - if request.model.which() == "modelKey": - model_key = request.model.modelKey.key - elif request.model.which() == "modelData": - model_bytes = request.model.modelData + if request.model.which() == "key": + model_key = request.model.key.key + elif request.model.which() == "data": + model_bytes = request.model.data callback_key = request.replyChannel.reply @@ -91,11 +92,11 @@ def deserialize_message( # # end client input_meta: t.List[t.Any] = [] - if 
request.input.which() == "inputKeys": - input_keys = [input_key.key for input_key in request.input.inputKeys] - elif request.input.which() == "inputData": - input_bytes = [data.blob for data in request.input.inputData] - input_meta = [data.tensorDescriptor for data in request.input.inputData] + if request.input.which() == "keys": + input_keys = [input_key.key for input_key in request.input.keys] + elif request.input.which() == "data": + input_bytes = [data.blob for data in request.input.data] + input_meta = [data.tensorDescriptor for data in request.input.data] inference_request = InferenceRequest( model_key=model_key, diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index c87722b290..4a4a7f899e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -30,6 +30,7 @@ import smartsim.error as sse from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim.log import get_logger logger = get_logger(__name__) @@ -48,7 +49,7 @@ def __init__( input_keys: t.Optional[t.List[str]] = None, input_meta: t.Optional[t.List[t.Any]] = None, output_keys: t.Optional[t.List[str]] = None, - raw_model: t.Optional[bytes] = None, + raw_model: t.Optional[Model] = None, batch_size: int = 0, ): """Initialize the object""" @@ -158,7 +159,7 @@ def fetch_model( # model_key = hash(request.raw_model) # feature_store[model_key] = request.raw_model # short-circuit and return the directly supplied model - return FetchModelResult(request.raw_model) + return FetchModelResult(request.raw_model.data) if not request.model_key: raise sse.SmartSimError( diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index b17f359c33..16cb242b7c 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -28,6 +28,7 @@ import numpy as np from .mli_schemas.data import data_references_capnp +from .mli_schemas.model import model_capnp from .mli_schemas.request import request_capnp from .mli_schemas.request.request_attributes import request_attributes_capnp from .mli_schemas.response import response_capnp @@ -112,6 +113,25 @@ def build_tensor_key(key: str) -> data_references_capnp.TensorKey: raise ValueError("Error building tensor key.") from e return tensor_key + @staticmethod + def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: + """ + Builds a new Model message with the provided data, name, and version. + + :param data: Model data + :param name: Model name + :param version: Model version + :raises ValueError: if building fails + """ + try: + model = model_capnp.Model.new_message() + model.data = data + model.name = name + model.version = version + except Exception as e: + raise ValueError("Error building model.") from e + return model + @staticmethod def build_model_key(key: str) -> data_references_capnp.ModelKey: """ @@ -187,7 +207,7 @@ def build_tf_response_attributes() -> ( @staticmethod def _assign_model( request: request_capnp.Request, - model: t.Union[data_references_capnp.ModelKey, t.ByteString], + model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], ) -> None: """ Assigns a model to the supplied request. 
@@ -197,16 +217,20 @@ def _assign_model( :raises ValueError: if building fails """ try: - if isinstance(model, bytes): - request.model.modelData = model + class_name = model.schema.node.displayName.split(":")[-1] # type: ignore + if class_name == "Model": + request.model.data = model # type: ignore + elif class_name == "ModelKey": + request.model.key = model # type: ignore else: - request.model.modelKey = model # type: ignore + raise ValueError("""Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""") except Exception as e: raise ValueError("Error building model portion of request.") from e @staticmethod def _assign_reply_channel( - request: request_capnp.Request, reply_channel: t.ByteString + request: request_capnp.Request, reply_channel: bytes ) -> None: """ Assigns a reply channel to the supplied request. @@ -239,9 +263,9 @@ def _assign_inputs( display_name = inputs[0].schema.node.displayName # type: ignore input_class_name = display_name.split(":")[-1] if input_class_name == "Tensor": - request.input.inputData = inputs # type: ignore + request.input.data = inputs # type: ignore elif input_class_name == "TensorKey": - request.input.inputKeys = inputs # type: ignore + request.input.keys = inputs # type: ignore else: raise ValueError( "Invalid input class name. Expected 'Tensor' or 'TensorKey'." @@ -324,8 +348,8 @@ def _assign_custom_request_attributes( @staticmethod def build_request( - reply_channel: t.ByteString, - model: t.Union[data_references_capnp.ModelKey, t.ByteString], + reply_channel: bytes, + model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] ], @@ -357,7 +381,7 @@ def build_request( return request @staticmethod - def serialize_request(request: request_capnp.RequestBuilder) -> t.ByteString: + def serialize_request(request: request_capnp.RequestBuilder) -> bytes: """ Serializes a built request message. @@ -366,7 +390,7 @@ def serialize_request(request: request_capnp.RequestBuilder) -> t.ByteString: return request.to_bytes() @staticmethod - def deserialize_request(request_bytes: t.ByteString) -> request_capnp.Request: + def deserialize_request(request_bytes: bytes) -> request_capnp.Request: """ Deserializes a serialized request message. @@ -499,14 +523,14 @@ def build_response( return response @staticmethod - def serialize_response(response: response_capnp.ResponseBuilder) -> t.ByteString: + def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: """ Serializes a built response message. """ return response.to_bytes() @staticmethod - def deserialize_response(response_bytes: t.ByteString) -> response_capnp.Response: + def deserialize_response(response_bytes: bytes) -> response_capnp.Response: """ Deserializes a serialized response message. 
""" diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index fa35989b32..f37a957267 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -32,4 +32,4 @@ struct ModelKey { struct TensorKey { key @0 :Text; -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py index de3f080116..099d10c438 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `data_references.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index 0e0edb8f99..6f775cf8f4 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `data_references.capnp`.""" # mypy: ignore-errors diff --git a/smartsim/_core/mli/mli_schemas/model/model.capnp b/smartsim/_core/mli/mli_schemas/model/model.capnp new file mode 100644 index 0000000000..fc9ed73663 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model.capnp @@ -0,0 +1,33 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xaefb9301e14ba4bd; + +struct Model { + data @0 :Data; + name @1 :Text; + version @2 :Text; +} diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.py b/smartsim/_core/mli/mli_schemas/model/model_capnp.py new file mode 100644 index 0000000000..be2c276c23 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.py @@ -0,0 +1,38 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `model.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "model.capnp")) +Model = capnp.load(module_file).Model +ModelBuilder = Model +ModelReader = Model diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi new file mode 100644 index 0000000000..6ca53a3579 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi @@ -0,0 +1,72 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `model.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class Model: + data: bytes + name: str + version: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ModelReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ModelReader: ... + @staticmethod + def new_message() -> ModelBuilder: ... + def to_dict(self) -> dict: ... + +class ModelReader(Model): + def as_builder(self) -> ModelBuilder: ... + +class ModelBuilder(Model): + @staticmethod + def from_dict(dictionary: dict) -> ModelBuilder: ... + def copy(self) -> ModelBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... 
+ def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ModelReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 9387090444..f9508cb54f 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -29,6 +29,7 @@ using Tensors = import "../tensor/tensor.capnp"; using RequestAttributes = import "request_attributes/request_attributes.capnp"; using DataRef = import "../data/data_references.capnp"; +using Models = import "../model/model.capnp"; struct ChannelDescriptor { reply @0 :Data; @@ -37,12 +38,12 @@ struct ChannelDescriptor { struct Request { replyChannel @0 :ChannelDescriptor; model :union { - modelKey @1 :DataRef.ModelKey; - modelData @2 :Data; + key @1 :DataRef.ModelKey; + data @2 :Models.Model; } input :union { - inputKeys @3 :List(DataRef.TensorKey); - inputData @4 :List(Tensors.Tensor); + keys @3 :List(DataRef.TensorKey); + data @4 :List(Tensors.Tensor); } output @5 :List(DataRef.TensorKey); outputDescriptors @6 :List(Tensors.OutputDescriptor); @@ -51,4 +52,4 @@ struct Request { tf @8 :RequestAttributes.TensorFlowRequestAttributes; none @9 :Void; } -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp index bc1af14d12..f0a319f0a3 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp @@ -46,4 +46,4 @@ struct TorchRequestAttributes { struct TensorFlowRequestAttributes { name @0 :Text; tensorType @1 :TFTensorType; -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py index 446ee6541f..8969f38457 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `request_attributes.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi index 977c3e6a09..c474de4b4f 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `request_attributes.capnp`.""" # mypy: ignore-errors diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_capnp.py index d8370b662d..90b8ce194e 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.py +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `request.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index 2051551550..39093f61ad 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `request.capnp`.""" # mypy: ignore-errors @@ -16,6 +42,7 @@ from ..data.data_references_capnp import ( TensorKeyBuilder, TensorKeyReader, ) +from ..model.model_capnp import Model, ModelBuilder, ModelReader from ..tensor.tensor_capnp import ( OutputDescriptor, OutputDescriptorBuilder, @@ -70,10 +97,13 @@ class ChannelDescriptorBuilder(ChannelDescriptor): class Request: class Model: - modelKey: ModelKey | ModelKeyBuilder | ModelKeyReader - modelData: bytes - def which(self) -> Literal["modelKey", "modelData"]: ... - def init(self, name: Literal["modelKey"]) -> ModelKey: ... + key: ModelKey | ModelKeyBuilder | ModelKeyReader + data: Model | ModelBuilder | ModelReader + def which(self) -> Literal["key", "data"]: ... + @overload + def init(self, name: Literal["key"]) -> ModelKey: ... + @overload + def init(self, name: Literal["data"]) -> Model: ... 
@staticmethod @contextmanager def from_bytes( @@ -92,11 +122,13 @@ class Request: def to_dict(self) -> dict: ... class ModelReader(Request.Model): - modelKey: ModelKeyReader + key: ModelKeyReader + data: ModelReader def as_builder(self) -> Request.ModelBuilder: ... class ModelBuilder(Request.Model): - modelKey: ModelKey | ModelKeyBuilder | ModelKeyReader + key: ModelKey | ModelKeyBuilder | ModelKeyReader + data: Model | ModelBuilder | ModelReader @staticmethod def from_dict(dictionary: dict) -> Request.ModelBuilder: ... def copy(self) -> Request.ModelBuilder: ... @@ -110,9 +142,9 @@ class Request: def write_packed(file: BufferedWriter) -> None: ... class Input: - inputKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - inputData: Sequence[Tensor | TensorBuilder | TensorReader] - def which(self) -> Literal["inputKeys", "inputData"]: ... + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + data: Sequence[Tensor | TensorBuilder | TensorReader] + def which(self) -> Literal["keys", "data"]: ... @staticmethod @contextmanager def from_bytes( @@ -131,13 +163,13 @@ class Request: def to_dict(self) -> dict: ... class InputReader(Request.Input): - inputKeys: Sequence[TensorKeyReader] - inputData: Sequence[TensorReader] + keys: Sequence[TensorKeyReader] + data: Sequence[TensorReader] def as_builder(self) -> Request.InputBuilder: ... class InputBuilder(Request.Input): - inputKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - inputData: Sequence[Tensor | TensorBuilder | TensorReader] + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + data: Sequence[Tensor | TensorBuilder | TensorReader] @staticmethod def from_dict(dictionary: dict) -> Request.InputBuilder: ... def copy(self) -> Request.InputBuilder: ... diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp index 0c5cee1a1c..67375b5a97 100644 --- a/smartsim/_core/mli/mli_schemas/response/response.capnp +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -48,4 +48,4 @@ struct Response { tf @5 :ResponseAttributes.TensorFlowResponseAttributes; none @6 :Void; } -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp index 59acd60312..b4dcf18e88 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp @@ -30,4 +30,4 @@ struct TorchResponseAttributes { } struct TensorFlowResponseAttributes { -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py index 3df1115b47..4839334d52 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `response_attributes.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi index 63c2218ff4..f40688d74a 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ """This is an automatically generated stub for `response_attributes.capnp`.""" # mypy: ignore-errors diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_capnp.py index 5762408272..eaa3451045 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_capnp.py +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `response.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi index 194c50d1c5..f6d7f8444e 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `response.capnp`.""" # mypy: ignore-errors diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp index 0097a0f9bb..aca1ce0836 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -77,4 +77,4 @@ struct OutputDescriptor { optionalKeys @1 :List(DataRef.TensorKey); optionalDimension @2 :List(Int32); optionalDatatype @3 :ReturnNumericalType; -} \ No newline at end of file +} diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py index a3938bda53..aa7f1e7b18 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `tensor.capnp`.""" import os diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi index 462911afdf..7e7222ef54 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi @@ -1,3 +1,29 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+ +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """This is an automatically generated stub for `tensor.capnp`.""" # mypy: ignore-errors diff --git a/tests/test_message_handler/test_build_model.py b/tests/test_message_handler/test_build_model.py new file mode 100644 index 0000000000..56c1c8764c --- /dev/null +++ b/tests/test_message_handler/test_build_model.py @@ -0,0 +1,72 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_model_successful(): + expected_data = b"model data" + expected_name = "model name" + expected_version = "v0.0.1" + model = handler.build_model(expected_data, expected_name, expected_version) + assert model.data == expected_data + assert model.name == expected_name + assert model.version == expected_version + + +@pytest.mark.parametrize( + "data, name, version", + [ + pytest.param( + 100, + "model name", + "v0.0.1", + id="bad data type", + ), + pytest.param( + b"model data", + 1, + "v0.0.1", + id="bad name type", + ), + pytest.param( + b"model data", + "model name", + 0.1, + id="bad version type", + ), + ], +) +def test_build_model_unsuccessful(data, name, version): + with pytest.raises(ValueError): + model = handler.build_model(data, name, version) diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index c527a0086f..b1fedaa024 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -72,6 +72,7 @@ pytestmark = pytest.mark.group_a model_key = MessageHandler.build_model_key("model_key") +model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") input_key1 = MessageHandler.build_tensor_key("input_key1") input_key2 = MessageHandler.build_tensor_key("input_key2") @@ -91,7 +92,7 @@ if should_run_tf: tf_indirect_request = MessageHandler.build_request( b"reply", - b"model", + model, [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1, output_descriptor2, output_descriptor3], @@ -100,7 +101,7 @@ tf_direct_request = MessageHandler.build_request( b"reply", - b"model", + model, [tensor_3, tensor_4], [], [output_descriptor1, output_descriptor2], @@ -110,7 +111,7 @@ if should_run_torch: torch_indirect_request = MessageHandler.build_request( b"reply", - b"model", + model, [input_key1, input_key2], [output_key1, output_key2], [output_descriptor1, output_descriptor2, output_descriptor3], @@ -118,7 +119,7 @@ ) torch_direct_request = MessageHandler.build_request( b"reply", - b"model", + model, [tensor_1, tensor_2], [], [output_descriptor1, output_descriptor2], @@ -140,7 +141,7 @@ ), pytest.param( b"another reply channel", - b"model data", + model, [input_key1], [output_key2], [output_descriptor1], @@ -148,7 +149,7 @@ ), pytest.param( b"another reply channel", - b"model data", + model, [input_key1], [output_key2], [output_descriptor1], @@ -177,13 +178,15 @@ def test_build_request_indirect_tf_successful( ) assert built_request is not None assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "modelKey": - assert built_request.model.modelKey.key == model.key + if built_request.model.which() == "key": + assert built_request.model.key.key == model.key else: - assert built_request.model.modelData == model - assert built_request.input.which() == "inputKeys" - assert built_request.input.inputKeys[0].key == input[0].key - assert len(built_request.input.inputKeys) == len(input) + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "keys" + assert built_request.input.keys[0].key == input[0].key + assert len(built_request.input.keys) == len(input) assert len(built_request.output) 
== len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order @@ -214,7 +217,7 @@ def test_build_request_indirect_tf_successful( ), pytest.param( b"another reply channel", - b"model data", + model, [input_key1], [output_key2], [output_descriptor1], @@ -222,7 +225,7 @@ def test_build_request_indirect_tf_successful( ), pytest.param( b"another reply channel", - b"model data", + model, [input_key1], [output_key2], [output_descriptor1], @@ -251,13 +254,15 @@ def test_build_request_indirect_torch_successful( ) assert built_request is not None assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "modelKey": - assert built_request.model.modelKey.key == model.key + if built_request.model.which() == "key": + assert built_request.model.key.key == model.key else: - assert built_request.model.modelData == model - assert built_request.input.which() == "inputKeys" - assert built_request.input.inputKeys[0].key == input[0].key - assert len(built_request.input.inputKeys) == len(input) + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "keys" + assert built_request.input.keys[0].key == input[0].key + assert len(built_request.input.keys) == len(input) assert len(built_request.output) == len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order @@ -490,7 +495,7 @@ def test_build_request_indirect_tf_unsuccessful( ), pytest.param( b"another reply channel", - b"model data", + model, [tensor_1], [], [output_descriptor3], @@ -498,7 +503,7 @@ def test_build_request_indirect_tf_unsuccessful( ), pytest.param( b"another reply channel", - b"model data", + model, [tensor_2], [], [output_descriptor1], @@ -506,7 +511,7 @@ def test_build_request_indirect_tf_unsuccessful( ), pytest.param( b"another reply channel", - b"model data", + model, [tensor_1], [], [output_descriptor1], @@ -527,13 +532,15 @@ def test_build_request_direct_torch_successful( ) assert built_request is not None assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "modelKey": - assert built_request.model.modelKey.key == model.key + if built_request.model.which() == "key": + assert built_request.model.key.key == model.key else: - assert built_request.model.modelData == model - assert built_request.input.which() == "inputData" - assert built_request.input.inputData[0].blob == input[0].blob - assert len(built_request.input.inputData) == len(input) + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "data" + assert built_request.input.data[0].blob == input[0].blob + assert len(built_request.input.data) == len(input) assert len(built_request.output) == len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order @@ -564,7 +571,7 @@ def test_build_request_direct_torch_successful( ), pytest.param( b"another reply channel", - b"model data", + model, [tensor_4], [], [output_descriptor3], @@ -572,7 +579,7 @@ def test_build_request_direct_torch_successful( ), pytest.param( b"another reply channel", - b"model data", + model, [tensor_4], [], [output_descriptor1], @@ -580,7 +587,7 @@ def test_build_request_direct_torch_successful( ), pytest.param( 
b"another reply channel", - b"model data", + model, [tensor_3], [], [output_descriptor1], @@ -601,13 +608,15 @@ def test_build_request_direct_tf_successful( ) assert built_request is not None assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "modelKey": - assert built_request.model.modelKey.key == model.key + if built_request.model.which() == "key": + assert built_request.model.key.key == model.key else: - assert built_request.model.modelData == model - assert built_request.input.which() == "inputData" - assert built_request.input.inputData[0].blob == input[0].blob - assert len(built_request.input.inputData) == len(input) + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "data" + assert built_request.input.data[0].blob == input[0].blob + assert len(built_request.input.data) == len(input) assert len(built_request.output) == len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order From 52abd324457bf2fc4762346bd0a2acee9e999fe5 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:05:49 -0700 Subject: [PATCH 05/60] Enable environment variable based configuration for ML Worker Manager (#621) EnvironmentConfigLoader added for ML Worker Manager. --- .github/workflows/run_tests.yml | 19 ++- Makefile | 7 +- doc/changelog.md | 3 +- pyproject.toml | 1 + .../infrastructure/control/workermanager.py | 18 ++- .../mli/infrastructure/environmentloader.py | 61 +++++++ .../storage/dragonfeaturestore.py | 4 +- tests/dragon/__init__.py | 0 tests/dragon/test_environment_loader.py | 152 ++++++++++++++++++ tests/dragon/utils/__init__.py | 0 tests/dragon/utils/featurestore.py | 128 +++++++++++++++ tests/mli/test_worker_manager.py | 24 ++- 12 files changed, 395 insertions(+), 22 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/environmentloader.py create mode 100644 tests/dragon/__init__.py create mode 100644 tests/dragon/test_environment_loader.py create mode 100644 tests/dragon/utils/__init__.py create mode 100644 tests/dragon/utils/featurestore.py diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f3a97474d3..1db15b13e4 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -54,7 +54,7 @@ jobs: strategy: fail-fast: false matrix: - subset: [backends, slow_tests, group_a, group_b] + subset: [backends, slow_tests, group_a, group_b, dragon] os: [macos-12, macos-14, ubuntu-22.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions @@ -112,9 +112,15 @@ jobs: python -m pip install .[dev,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) - if: contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12') + if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) run: smart build --device cpu --onnx -v + - name: Install ML Runtimes with Smart (with pt, tf, dragon, and onnx support) + if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset == 'dragon' ) + run: | + smart build --device cpu --onnx --dragon -v + echo "LD_LIBRARY_PATH=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/.dragon/dragon-0.9/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + - name: Install ML 
Runtimes with Smart (no ONNX,TF on Apple Silicon) if: contains( matrix.os, 'macos-14' ) run: smart build --device cpu --no_tf -v @@ -143,9 +149,16 @@ jobs: echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/backends + # Run pytest (dragon subtests) + - name: Run Dragon Pytest + if: (matrix.subset == 'dragon' && matrix.os == 'ubuntu-22.04') + run: | + echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV + dragon -s py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests + # Run pytest (test subsets) - name: Run Pytest - if: "!contains(matrix.subset, 'backends')" # if not running backend tests + if: (matrix.subset != 'backends' && matrix.subset != 'dragon') # if not running backend tests or dragon tests run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests diff --git a/Makefile b/Makefile index bddbda722b..aaf1736258 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ tutorials-prod: # help: test - Run all tests .PHONY: test test: - @python -m pytest --ignore=tests/full_wlm/ + @python -m pytest --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-verbose - Run all tests verbosely .PHONY: test-verbose @@ -192,3 +192,8 @@ test-full: .PHONY: test-wlm test-wlm: @python -m pytest -vv tests/full_wlm/ tests/on_wlm + +# help: test-dragon - Run dragon-specific tests +.PHONY: test-dragon +test-dragon: + @dragon pytest tests/dragon diff --git a/doc/changelog.md b/doc/changelog.md index 9f85c90959..208aa7c2dd 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,10 +13,11 @@ Jump to: Description +- Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included - Removed device from schemas, MessageHandler and tests - Add ML worker manager, sample worker, and feature store -- Added schemas and MessageHandler class for de/serialization of +- Add schemas and MessageHandler class for de/serialization of inference requests and response messages diff --git a/pyproject.toml b/pyproject.toml index 62df92f0c9..61e17891b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ markers = [ "group_a: fast test subset a", "group_b: fast test subset b", "slow_tests: tests that take a long duration to complete", + "dragon: tests that must be executed in a dragon runtime", ] [tool.isort] diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 43ff6e6799..2f7cb4ce69 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -32,6 +32,7 @@ from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( InferenceReply, @@ -43,6 +44,8 @@ from 
smartsim.log import get_logger if t.TYPE_CHECKING: + from dragon.fli import FLInterface + from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum @@ -162,28 +165,29 @@ class WorkerManager(Service): def __init__( self, - task_queue: "mp.Queue[bytes]", + config_loader: EnvironmentConfigLoader, worker: MachineLearningWorkerBase, - feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: """Initialize the WorkerManager - :param task_queue: The queue to monitor for new tasks + :param config_loader: Environment config loader that loads the task queue and + feature store :param workers: A worker to manage - :param feature_store: The persistence mechanism :param as_service: Specifies run-once or run-until-complete behavior of service - :param cooldown: Number of seconds to wait before shutting down afer + :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met :param comm_channel_type: The type of communication channel used for callbacks """ super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: "mp.Queue[bytes]" = task_queue + self._task_queue: t.Optional["FLInterface"] = config_loader.get_queue() """the queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = feature_store + self._feature_store: t.Optional[FeatureStore] = ( + config_loader.get_feature_store() + ) """a feature store to retrieve models from""" self._worker = worker """The ML Worker implementation""" diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py new file mode 100644 index 0000000000..267b668f63 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -0,0 +1,61 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import base64 +import os +import pickle +import typing as t + +from dragon.fli import FLInterface # pylint: disable=all + +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + + +class EnvironmentConfigLoader: + """ + Facilitates the loading of a FeatureStore and Queue + into the WorkerManager. + """ + + def __init__(self) -> None: + self._feature_store_descriptor = os.getenv("SSFeatureStore", None) + self._queue_descriptor = os.getenv("SSQueue", None) + self.feature_store: t.Optional[FeatureStore] = None + self.queue: t.Optional["FLInterface"] = None + + def get_feature_store(self) -> t.Optional[FeatureStore]: + """Loads the Feature Store previously set in SSFeatureStore""" + if self._feature_store_descriptor is not None: + self.feature_store = pickle.loads( + base64.b64decode(self._feature_store_descriptor) + ) + return self.feature_store + + def get_queue(self) -> t.Optional["FLInterface"]: + """Returns the Queue previously set in SSQueue""" + if self._queue_descriptor is not None: + self.queue = FLInterface.attach(base64.b64decode(self._queue_descriptor)) + return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index ea8f06977d..8153255d0a 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -31,7 +31,7 @@ from smartsim.log import get_logger if t.TYPE_CHECKING: - from dragon.data.distdictionary.dragon_dict import DragonDict + from dragon.data.ddict.ddict import DDict logger = get_logger(__name__) @@ -40,7 +40,7 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" - def __init__(self, storage: "DragonDict") -> None: + def __init__(self, storage: "DDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage diff --git a/tests/dragon/__init__.py b/tests/dragon/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py new file mode 100644 index 0000000000..d339fec885 --- /dev/null +++ b/tests/dragon/test_environment_loader.py @@ -0,0 +1,152 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import os +import pickle + +import pytest + +dragon = pytest.importorskip("dragon") + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import DragonFLIError, FLInterface + +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) + +from .utils.featurestore import MemoryFeatureStore + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.mark.parametrize( + "content", + [ + pytest.param(b"a"), + pytest.param(b"new byte string"), + ], +) +def test_environment_loader_attach_FLI(content, monkeypatch): + """A descriptor can be stored, loaded, and reattached""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + + config = EnvironmentConfigLoader() + config_queue = config.get_queue() + + new_sender = config_queue.sendh(use_main_as_stream_channel=True) + new_sender.send_bytes(content) + + old_recv = queue.recvh(use_main_as_stream_channel=True) + result, _ = old_recv.recv_bytes() + assert result == content + + +def test_environment_loader_serialize_FLI(monkeypatch): + """The serialized descriptors of a loaded and unloaded + queue are the same""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + + config = EnvironmentConfigLoader() + config_queue = config.get_queue() + assert config_queue.serialize() == queue.serialize() + + +def test_environment_loader_FLI_fails(monkeypatch): + """An incorrect serialized descriptor will fails to attach""" + monkeypatch.setenv("SSQueue", "randomstring") + config = EnvironmentConfigLoader() + + with pytest.raises(DragonFLIError): + config_queue = config.get_queue() + + +@pytest.mark.parametrize( + "expected_keys, expected_values", + [ + pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), + pytest.param(["another key"], ["another value"]), + ], +) +def test_environment_loader_memory_featurestore( + expected_keys, expected_values, monkeypatch +): + """MemoryFeatureStores can be correctly serialized and deserialized""" + feature_store = MemoryFeatureStore() + key_value_pairs = zip(expected_keys, expected_values) + for k, v in key_value_pairs: + feature_store[k] = v + monkeypatch.setenv( + "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") + ) + config = EnvironmentConfigLoader() + config_feature_store = config.get_feature_store() + + for k, _ in key_value_pairs: + assert config_feature_store[k] == feature_store[k] + + +@pytest.mark.parametrize( + "expected_keys, expected_values", + [ + pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), + pytest.param(["another key"], ["another value"]), + ], +) +def 
test_environment_loader_dragon_featurestore( + expected_keys, expected_values, monkeypatch +): + """DragonFeatureStores can be correctly serialized and deserialized""" + storage = DDict() + feature_store = DragonFeatureStore(storage) + key_value_pairs = zip(expected_keys, expected_values) + for k, v in key_value_pairs: + feature_store[k] = v + monkeypatch.setenv( + "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") + ) + config = EnvironmentConfigLoader() + config_feature_store = config.get_feature_store() + + for k, _ in key_value_pairs: + assert config_feature_store[k] == feature_store[k] + + +def test_environment_variables_not_set(): + """EnvironmentConfigLoader getters return None when environment + variables are not set""" + config = EnvironmentConfigLoader() + assert config.get_feature_store() == None + assert config.get_queue() == None diff --git a/tests/dragon/utils/__init__.py b/tests/dragon/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dragon/utils/featurestore.py b/tests/dragon/utils/featurestore.py new file mode 100644 index 0000000000..93b3134318 --- /dev/null +++ b/tests/dragon/utils/featurestore.py @@ -0,0 +1,128 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + + +class MemoryFeatureStore(FeatureStore): + """A feature store with values persisted only in local memory""" + + def __init__(self) -> None: + """Initialize the MemoryFeatureStore instance""" + self._storage: t.Dict[str, bytes] = {} + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + if key not in self._storage: + raise sse.SmartSimError(f"{key} not found in feature store") + return self._storage[key] + + def __setitem__(self, key: str, value: bytes) -> None: + """Membership operator to test for a key existing within the feature store. 
+ Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + self._storage[key] = value + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + return key in self._storage + + +class FileSystemFeatureStore(FeatureStore): + """Alternative feature store implementation for testing. Stores all + data on the file system""" + + def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + """Initialize the FileSystemFeatureStore instance + :param storage_dir: (optional) root directory to store all data relative to""" + self._storage_dir = storage_dir + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + path = self._key_path(key) + if not path.exists(): + raise sse.SmartSimError(f"{path} not found in feature store") + return path.read_bytes() + + def __setitem__(self, key: str, value: bytes) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + path = self._key_path(key, create=True) + path.write_bytes(value) + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + path = self._key_path(key) + return path.exists() + + def _key_path(self, key: str, create: bool = False) -> pathlib.Path: + """Given a key, return a path that is optionally combined with a base + directory used by the FileSystemFeatureStore. 
+ :param key: Unique key of an item to retrieve from the feature store""" + value = pathlib.Path(key) + + if self._storage_dir: + value = self._storage_dir / key + + if create: + value.parent.mkdir(parents=True, exist_ok=True) + + return value + + +class DragonDict: + """Mock implementation of a dragon dictionary""" + + def __init__(self) -> None: + """Initialize the mock DragonDict instance""" + self._storage: t.Dict[bytes, t.Any] = {} + + def __getitem__(self, key: bytes) -> t.Any: + """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" + return self._storage[key] + + def __setitem__(self, key: bytes, value: t.Any) -> None: + """Assign a value using key + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + self._storage[key] = value + + def __contains__(self, key: bytes) -> bool: + """Return `True` if the key is found, `False` otherwise + :param key: Unique key of an item to retrieve from the feature store""" + return key in self._storage
diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 01502ec521..9e9b73c4fa 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -34,7 +34,12 @@ import pytest import torch -from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, + WorkerManager, +) from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger @@ -44,8 +49,8 @@ from .worker import IntegratedTorchWorker logger = get_logger(__name__) -# The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_a +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: @@ -166,14 +171,12 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: fs_path = test_path / "feature_store" comm_path = test_path / "comm_store" - work_queue: "mp.Queue[bytes]" = mp.Queue() + config_loader = EnvironmentConfigLoader() integrated_worker = IntegratedTorchWorker() - file_system_store = FileSystemFeatureStore() worker_manager = WorkerManager( - work_queue, + config_loader, integrated_worker, - file_system_store, as_service=True, cooldown=10, comm_channel_type=FileSystemCommChannel, @@ -182,7 +185,12 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, - args=( work_queue, file_system_store, fs_path, comm_path), + args=( + config_loader.get_queue(), + config_loader.get_feature_store(), + fs_path, + comm_path, + ), ) msg_pump.start()
From eace71e73e4a1e209bb6828243607d1b39f8e964 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Jul 2024 22:26:06 +0200 Subject: [PATCH 06/60] FLI-based Worker Manager (#622) This PR adds a simple `TorchWorker` which performs inference. The output transform is still not implemented, but it is not needed for the time being.
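For orientation, a minimal sketch of how the worker class travels from the driver to the standalone worker manager: the driver serializes the class with cloudpickle and base64 so it can be passed as a command-line argument, and the worker manager process decodes and instantiates it. This is illustrative only, not part of the patch; the names mirror the example files added below (mli_driver.py and standalone_workermanager.py).

    # illustrative sketch only -- mirrors mli_driver.py / standalone_workermanager.py
    import base64
    import cloudpickle

    from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker

    # driver side: serialize the worker class so it can be passed as a CLI argument
    worker_arg = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii")

    # worker manager side: restore the class and create the worker instance
    worker_cls = cloudpickle.loads(base64.b64decode(worker_arg.encode("ascii")))
    torch_worker = worker_cls()

The remaining wiring (the Dragon FLI queue and feature store descriptors exposed through the SSQueue and SSFeatureStore environment variables) is shown in the example scripts below.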
[ committed by @al-rigazzi ] [ reviewed by @AlyssaCote @ankona ] --- doc/changelog.md | 1 + ex/high_throughput_inference/mli_driver.py | 50 +++++ ex/high_throughput_inference/mock_app.py | 195 ++++++++++++++++++ .../mock_app_redis.py | 88 ++++++++ ex/high_throughput_inference/redis_driver.py | 65 ++++++ .../standalone_workermanager.py | 96 +++++++++ smartsim/_core/entrypoints/service.py | 20 +- .../_core/launcher/dragon/dragonBackend.py | 27 ++- smartsim/_core/mli/comm/channel/channel.py | 7 +- .../_core/mli/comm/channel/dragonchannel.py | 22 +- smartsim/_core/mli/comm/channel/dragonfli.py | 69 +++++++ .../infrastructure/control/workermanager.py | 134 +++++++++--- .../mli/infrastructure/environmentloader.py | 16 +- .../storage/dragonfeaturestore.py | 21 +- .../infrastructure/storage/featurestore.py | 5 +- .../mli/infrastructure/worker/torch_worker.py | 119 +++++++++++ .../_core/mli/infrastructure/worker/worker.py | 73 +++---- smartsim/_core/mli/message_handler.py | 10 +- tests/dragon/test_environment_loader.py | 7 +- tests/mli/test_torch_worker.py | 173 ++++++++++++++++ tests/mli/test_worker_manager.py | 3 +- tests/test_dragon_backend.py | 10 + 22 files changed, 1103 insertions(+), 108 deletions(-) create mode 100644 ex/high_throughput_inference/mli_driver.py create mode 100644 ex/high_throughput_inference/mock_app.py create mode 100644 ex/high_throughput_inference/mock_app_redis.py create mode 100644 ex/high_throughput_inference/redis_driver.py create mode 100644 ex/high_throughput_inference/standalone_workermanager.py create mode 100644 smartsim/_core/mli/comm/channel/dragonfli.py create mode 100644 smartsim/_core/mli/infrastructure/worker/torch_worker.py create mode 100644 tests/mli/test_torch_worker.py diff --git a/doc/changelog.md b/doc/changelog.md index 208aa7c2dd..ee41fabf88 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add TorchWorker first implementation and mock inference app example - Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included - Removed device from schemas, MessageHandler and tests diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..6da559aa6f --- /dev/null +++ b/ex/high_throughput_inference/mli_driver.py @@ -0,0 +1,50 @@ + + +import os +import base64 +import cloudpickle +import sys +from smartsim import Experiment +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim.status import TERMINAL_STATUSES +import time +import typing as t + +device = "gpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + +transport: t.Literal["hsta", "tcp"] = "hsta" + +os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport + +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) + +torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) 
+worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..45246db2e5 --- /dev/null +++ b/ex/high_throughput_inference/mock_app.py @@ -0,0 +1,195 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import time +import torch +import numbers + +from collections import OrderedDict +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger("App") + +class ProtoClient: + def __init__(self, timing_on: bool): + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + self._ddict = DDict.attach(ddict_str) + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) + except KeyError: + time.sleep(1) + self._from_worker_ch = Channel.make_process_local() + self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._to_worker_ch = Channel.make_process_local() + + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = timing_on + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self, batch_size: int): + if self._timing_on: + self._add_label_to_timings("batch_size") + self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + + def run_model(self, model: bytes | str, batch: torch.Tensor): + self.start_timings(batch.shape[0]) + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape)) + self.measure_time("build_tensor") + built_model = None + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch_serialized, + model= model_arg, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + to_sendh.send_bytes(request_bytes) + logger.info(f"Message size: {len(request_bytes)} bytes") + + self.measure_time("send") + with 
self._from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.measure_time("receive") + response = MessageHandler.deserialize_response(resp) + self.measure_time("deserialize_response") + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + self.measure_time("deserialize_tensor") + + self.end_timings() + return result + + def set_model(self, key: str, model: bytes): + self._ddict[key] = model + + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = ProtoClient(timing_on=True) + client.set_model(resnet.name, resnet.model) + + total_iterations = 100 + + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + logger.info(f"Iteration: {iteration_number}") + client.run_model(resnet.name, resnet.get_batch(batch_size)) + + client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py new file mode 100644 index 0000000000..c56b4fb8b4 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -0,0 +1,88 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import io +import numpy +import time +import torch +from smartsim.log import get_logger +from smartredis import Client + +logger = get_logger("App") + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = Client(cluster=False, address=None) + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + total_iterations = 100 + timings=[] + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + timing = [batch_size] + logger.info(f"Iteration: {iteration_number}") + start = time.perf_counter() + client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) + result = client.get_tensor(name="result") + end = time.perf_counter() + timing.append(end-start) + timings.append(timing) + + + + timings_np = numpy.asarray(timings) + numpy.save("timings.npy", timings_np) + for timing in timings: + print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py new file mode 100644 index 0000000000..ceddba4ef7 --- /dev/null +++ b/ex/high_throughput_inference/redis_driver.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time +import typing as t + +device = "gpu" +filedir = os.path.dirname(__file__) +app_script_name = os.path.join(filedir, "mock_app_redis.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp_path = os.path.join(filedir, "redis_ai") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) + +db = exp.create_database(interface="hsn0") + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_nodes(1) +app_rs.set_tasks(1) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(db, app, overwrite=True) + +exp.start(db, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(db) + break + if exp.get_status(db)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..c56e11a7c3 --- /dev/null +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,96 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import argparse +import base64 +import cloudpickle +import pickle +import os + +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices="gpu cpu".split(), + help="Device on which the inference takes place", + ) + parser.add_argument( + "--worker_class", + type=str, + required=True, + help="Serialized class of worker to run", + ) + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers to run" + ) + + args = parser.parse_args() + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + ddict["to_worker_fli"] = to_worker_fli_serialized + + torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + + dfs = DragonFeatureStore(ddict) + comm_channel = DragonFLIChannel(to_worker_fli_serialized) + + os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader() + + worker_manager = WorkerManager( + config_loader=config_loader, + worker=torch_worker, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device = args.device, + ) + worker_manager.execute() diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index e03df6bea1..df9c2bbef6 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -46,7 +46,8 @@ def __init__( :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic - shutdown, in seconds. A non-zero, positive integer.""" + shutdown, in seconds. A non-zero, positive integer. 
+ :param loop_delay: delay between iterations of the event loop""" self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) @@ -102,6 +103,23 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None + headers = [ + "batch_size", + "w_deserialize", + "w_fetch_model", + "w_load_model", + "w_fetch_input", + "w_transform_input", + "w_execute", + "w_transform_output", + "w_assign_output", + "w_build_reply", + "w_serialize_resp", + "w_send", + ] + + print(",".join(headers)) + while running: self._on_iteration() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2456606623..dcc5c8392b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -36,8 +36,10 @@ # pylint: disable=import-error # isort: off +import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group @@ -187,6 +189,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) + self._infra_ddict: t.Optional[dragon_ddict.DDict] = None @property def hosts(self) -> list[str]: @@ -391,6 +394,22 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] + @property + def infra_ddict(self) -> str: + """Create a Dragon distributed dictionary and return its + serialized descriptor + """ + if self._infra_ddict is None: + logger.info("Creating DDict") + self._infra_ddict = dragon_ddict.DDict( + n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 + ) # todo: parametrize + logger.info("Created DDict") + self._infra_ddict["creation"] = str(time.time()) + logger.info(self._infra_ddict["creation"]) + + return str(self._infra_ddict.serialize()) + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -406,6 +425,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], ) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) @@ -421,10 +441,15 @@ def _start_steps(self) -> None: target=request.exe, args=request.exe_args, cwd=request.path, - env={**request.current_env, **request.env}, + env={ + **request.current_env, + **request.env, + "SS_DRG_DDICT": self.infra_ddict, + }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, policy=local_policy, + options=options, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 201ab9deab..2318896a9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,9 +41,14 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: 
The value to send""" + @abstractmethod + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + @property def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 4fd26861ca..1409747a91 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,16 +24,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -if t.TYPE_CHECKING: +try: import dragon.channels as dch - import dragon.utils as du +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None class DragonCommChannel(cch.CommChannelBase): @@ -42,11 +44,17 @@ class DragonCommChannel(cch.CommChannelBase): def __init__(self, key: bytes) -> None: """Initialize the DragonCommChannel instance""" super().__init__(key) - # todo: do we need memory pool information to construct the channel correctly? - self._channel: "dch.Channel" = du.get_channel(key) + self._channel: dch.Channel = dch.Channel.attach(key) def send(self, value: bytes) -> None: """Send a message throuh the underlying communication channel :param value: The value to send""" - logger.debug(f"Channel {self.descriptor.decode('utf-8')} sending message") - self._channel.send_bytes(value) + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=None) + return message_bytes diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py new file mode 100644 index 0000000000..75f8fb4bfc --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -0,0 +1,69 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +from dragon import fli +import dragon.channels as dch + +# isort: on + +import sys +import typing as t + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFLIChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon FLI Channel""" + + def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: + """Initialize the DragonFLIChannel instance""" + super().__init__(fli_desc) + # todo: do we need memory pool information to construct the channel correctly? + self._fli: "fli" = fli.FLInterface.attach(fli_desc) + self._channel: t.Optional["dch"] = ( + dch.Channel.make_process_local() if sender_supplied else None + ) + + def send(self, value: bytes) -> None: + """Send a message through the underlying communication channel + :param value: The value to send""" + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._fli.recvh(timeout=None) as recvh: + try: + request_bytes: bytes + request_bytes, _ = recvh.recv_bytes(timeout=None) + return request_bytes + except fli.FLIEOT as exc: + return b"" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 2f7cb4ce69..8c06351fb5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,24 +24,34 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import multiprocessing as mp +import sys + +# isort: off +import dragon +from dragon import fli + +# isort: on + +import time import typing as t import numpy as np -from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.worker import ( +from .....error import SmartSimError +from .....log import get_logger +from ....entrypoints.service import Service +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonchannel import DragonCommChannel +from ...infrastructure.environmentloader import EnvironmentConfigLoader +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, + LoadModelResult, MachineLearningWorkerBase, ) -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.mli.mli_schemas.response.response_capnp import Response -from smartsim.log import get_logger +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import Response if t.TYPE_CHECKING: from dragon.fli import FLInterface @@ -53,7 +63,9 @@ def deserialize_message( - data_blob: bytes, channel_type: t.Type[CommChannelBase] + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -87,12 +99,6 @@ def deserialize_message( None # these will really be tensors already ) - # # client example - # msg = Message() - # t = torch.Tensor() - # msg.inputs = [custom_byte_converter(t)] - # mli_client.request_inference(msg) - # # end client input_meta: t.List[t.Any] = [] if request.input.which() == "keys": @@ -170,6 +176,7 @@ def __init__( as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -182,8 +189,7 @@ def __init__( """ super().__init__(as_service, cooldown) - """a collection of workers the manager is controlling""" - self._task_queue: t.Optional["FLInterface"] = config_loader.get_queue() + self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() @@ -193,6 +199,10 @@ def __init__( """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" + self._device = device + """Device on which workers need to run""" + self._cached_models: dict[str, t.Any] = {} + """Dictionary of previously loaded models""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
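As an editorial aside on the constructor changes above: a worker manager is now assembled from an environment-driven config loader, a concrete worker implementation, and a target device. The sketch below shows plausible wiring under two assumptions, namely that config_loader and worker are the first two positional parameters (only hinted at by the docstring and assignments visible in this hunk) and that SSQueue/SSFeatureStore have already been set in the environment by the Dragon backend.

    from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager
    from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
    from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker

    # EnvironmentConfigLoader reads SSQueue/SSFeatureStore from the environment
    config_loader = EnvironmentConfigLoader()

    worker_manager = WorkerManager(
        config_loader,     # supplies the FLI task queue and the feature store
        TorchWorker(),     # ML worker used by _on_iteration for load/execute
        as_service=True,   # keep iterating until _can_shutdown() returns True
        cooldown=10,
        device="gpu",      # forwarded to load_model/transform_input/transform_output
    )
    worker_manager.execute()
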
@@ -234,24 +244,68 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return + timings = [] # timing # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.get() + request_bytes: bytes = self._task_queue.recv() - request = deserialize_message(request_bytes, self._comm_channel_type) + interm = time.perf_counter() # timing + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) if not self._validate_request(request): return - # # let the worker perform additional custom deserialization - # request = self._worker.deserialize(request_bytes) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + + if not request.raw_model: + if request.model_key is None: + # A valid request should never get here. + raise ValueError("Could not read model key") + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + model_result = LoadModelResult(self._cached_models[request.model_key]) + + else: + fetch_model_result = None + while True: + try: + interm = time.perf_counter() # timing + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except KeyError: + time.sleep(0.1) + else: + break + + if fetch_model_result is None: + raise SmartSimError("Could not retrieve model from feature store") + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) + self._cached_models[request.model_key] = model_result.model + else: + fetch_model_result = self._worker.fetch_model(request, None) + model_result = self._worker.load_model( + request, fetch_result=fetch_model_result, device=self._device + ) - fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model(request, fetch_model_result) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - transformed_input = self._worker.transform_input(request, fetch_input_result) - # batch: t.Collection[_Datum] = transform_result.transformed_input - # if self._batch_size: - # batch = self._worker.batch_requests(transform_result, self._batch_size) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + transformed_input = self._worker.transform_input( + request, fetch_input_result, self._device + ) + + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing reply = InferenceReply() @@ -260,8 +314,14 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - transformed_output = self._worker.transform_output(request, execute_result) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + transformed_output = self._worker.transform_output( + request, execute_result, self._device + ) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -272,6 +332,9 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True + timings.append(time.perf_counter() - interm) # timing + interm = 
time.perf_counter() # timing + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -280,11 +343,22 @@ def _on_iteration(self) -> None: response = build_reply(reply) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + + print(" ".join(str(time) for time in timings)) # timing + def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 267b668f63..9f6770623d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -31,6 +31,7 @@ from dragon.fli import FLInterface # pylint: disable=all +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -41,10 +42,12 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor = os.getenv("SSFeatureStore", None) - self._queue_descriptor = os.getenv("SSQueue", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv( + "SSFeatureStore", None + ) + self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None - self.queue: t.Optional["FLInterface"] = None + self.queue: t.Optional[DragonFLIChannel] = None def get_feature_store(self) -> t.Optional[FeatureStore]: """Loads the Feature Store previously set in SSFeatureStore""" @@ -54,8 +57,11 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: ) return self.feature_store - def get_queue(self) -> t.Optional["FLInterface"]: + def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = FLInterface.attach(base64.b64decode(self._queue_descriptor)) + self.queue = DragonFLIChannel( + fli_desc=base64.b64decode(self._queue_descriptor), + sender_supplied=sender_supplied, + ) return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 8153255d0a..af592ed0ab 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -44,27 +44,28 @@ def __init__(self, storage: "DDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage - def __getitem__(self, key: str) -> t.Any: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" - key_ = key.encode("utf-8") try: - return self._storage[key_] + value: t.Union[str, bytes] = self._storage[key] + return value + except KeyError as ex: + raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError(f"{key} not found 
in feature store") from ex + raise sse.SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" - key_ = key.encode("utf-8") - self._storage[key_] = value + self._storage[key] = value - def __contains__(self, key: t.Union[str, bytes]) -> bool: + def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" - if isinstance(key, str): - key = key.encode("utf-8") return key in self._storage diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index ec4086b732..553e13b10f 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t from abc import ABC, abstractmethod @@ -32,12 +33,12 @@ class FeatureStore(ABC): values from a feature store implementation""" @abstractmethod - def __getitem__(self, key: str) -> bytes: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py new file mode 100644 index 0000000000..a4e725ab99 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -0,0 +1,119 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io + +import numpy as np +import torch + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + TransformInputResult, + TransformOutputResult, +) + +logger = get_logger(__name__) + + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif request.raw_model and request.raw_model.data: + model_bytes = request.raw_model.data + else: + raise ValueError("Unable to load model without reference object") + + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] + buffer = io.BytesIO(initial_bytes=model_bytes) + model = torch.jit.load(buffer, map_location=device) # type: ignore + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult, device: str + ) -> TransformInputResult: + result = [] + + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! + + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + result_device: str, + ) -> TransformOutputResult: + if result_device != "cpu": + transformed = [item.to("cpu") for item in execute_result.predictions] + # todo: need the shape from latest schemas added here. 
+ return TransformOutputResult(transformed, None, "c", "float32") # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 4a4a7f899e..900a8241de 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -27,11 +27,11 @@ import typing as t from abc import ABC, abstractmethod -import smartsim.error as sse -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.mli_schemas.model.model_capnp import Model -from smartsim.log import get_logger +from .....error import SmartSimError +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...infrastructure.storage.featurestore import FeatureStore +from ...mli_schemas.model.model_capnp import Model logger = get_logger(__name__) @@ -105,23 +105,23 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes]) -> None: + def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: """Initialize the object""" self.inputs = result + self.meta = meta class TransformOutputResult: """A wrapper around inference results transformed for transmission""" def __init__( - self, result: t.Any, shape: t.List[int], order: str, dtype: str + self, result: t.Any, shape: t.Optional[t.List[int]], order: str, dtype: str ) -> None: """Initialize the OutputTransformResult""" self.outputs = result self.shape = shape self.order = order self.dtype = dtype - # todo: determine if each output must have an individual (shape, order, dtype) class CreateInputBatchResult: @@ -137,7 +137,7 @@ class FetchModelResult: def __init__(self, result: bytes) -> None: """Initialize the object""" - self.model_bytes = result + self.model_bytes: bytes = result class MachineLearningWorkerCore: @@ -151,8 +151,6 @@ def fetch_model( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if not feature_store: - raise ValueError("Feature store is required for model retrieval") if request.raw_model: # Should we cache model in the feature store? 
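A quick aside on the updated FeatureStore contract used by fetch_model and fetch_inputs: values are now plain str/bytes and keys are no longer encoded, so a dict-backed stand-in is enough to exercise these paths locally. The sketch below assumes __getitem__, __setitem__, and __contains__ are the abstract members of the base class; it is illustrative only and not part of this patch.

    import typing as t

    from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore


    class MemoryFeatureStore(FeatureStore):
        """Dict-backed feature store useful for local tests of the fetch paths."""

        def __init__(self) -> None:
            self._items: t.Dict[str, t.Union[str, bytes]] = {}

        def __getitem__(self, key: str) -> t.Union[str, bytes]:
            # mirrors DragonFeatureStore behavior: missing keys raise KeyError
            return self._items[key]

        def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
            self._items[key] = value

        def __contains__(self, key: str) -> bool:
            return key in self._items
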
@@ -161,17 +159,20 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model.data) + if not feature_store: + raise ValueError("Feature store is required for model retrieval") + if not request.model_key: - raise sse.SmartSimError( + raise SmartSimError( "Key must be provided to retrieve model from feature store" ) try: - raw_bytes = feature_store[request.model_key] + raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {request.model_key}" ) from ex @@ -184,24 +185,27 @@ def fetch_inputs( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" + + if request.raw_inputs: + return FetchInputResult(request.raw_inputs, request.input_meta) + if not feature_store: - raise ValueError("Feature store is required for input retrieval") + raise ValueError("No input and no feature store provided") if request.input_keys: data: t.List[bytes] = [] for input_ in request.input_keys: try: - tensor_bytes = feature_store[input_] + tensor_bytes = t.cast(bytes, feature_store[input_]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex - return FetchInputResult(data) - - if request.raw_inputs: - return FetchInputResult(request.raw_inputs) + return FetchInputResult( + data, None + ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -249,32 +253,26 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): """Abstrct base class providing contract for a machine learning worker implementation.""" - # @staticmethod - # @abstractmethod - # def deserialize(request: InferenceRequest) -> InferenceRequest: - # """Given a collection of data serialized to bytes, convert the bytes - # to a proper representation used by the ML backend - # :param data_blob: inference request as a byte-serialized blob - # :return: InferenceRequest deserialized from the input""" - @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw output from fetching inputs out of a feature store + :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -293,20 +291,11 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, + request: InferenceRequest, execute_result: ExecuteResult, result_device: str ) -> 
TransformOutputResult: """Given inference results, perform transformations required to transmit results to the requestor. :param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult + :param result_device: The device on which the result of inference is placed :return:""" - - # @staticmethod - # @abstractmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> bytes: - # """Given an output, serialize to bytes for transport - # :param reply: The result of the inference pipeline - # :return: a byte-serialized version of the reply""" diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 16cb242b7c..bcf1cfdf14 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -396,7 +396,9 @@ def deserialize_request(request_bytes: bytes) -> request_capnp.Request: :param request_bytes: Bytes to be deserialized into a Request """ - bytes_message = request_capnp.Request.from_bytes(request_bytes) + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message @@ -489,7 +491,7 @@ def _assign_custom_response_attributes( response.customAttributes.tf = custom_attrs # type: ignore else: raise ValueError("""Invalid custom attribute class name. - Expected 'TensorFlowResponseAttributes' or + Expected 'TensorFlowResponseAttributes' or 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e @@ -534,7 +536,9 @@ def deserialize_response(response_bytes: bytes) -> response_capnp.Response: """ Deserializes a serialized response message. """ - bytes_message = response_capnp.Response.from_bytes(response_bytes) + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d339fec885..00db0a9d32 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -64,10 +64,9 @@ def test_environment_loader_attach_FLI(content, monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - new_sender = config_queue.sendh(use_main_as_stream_channel=True) - new_sender.send_bytes(content) + new_sender = config_queue.send(content) - old_recv = queue.recvh(use_main_as_stream_channel=True) + old_recv = queue.recvh() result, _ = old_recv.recv_bytes() assert result == content @@ -81,7 +80,7 @@ def test_environment_loader_serialize_FLI(monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - assert config_queue.serialize() == queue.serialize() + assert config_queue._fli.serialize() == queue.serialize() def test_environment_loader_FLI_fails(monkeypatch): diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py new file mode 100644 index 0000000000..0b1cd4ccf3 --- /dev/null +++ b/tests/mli/test_torch_worker.py @@ -0,0 +1,173 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io + +import numpy as np +import pytest +import torch +from torch import nn +from torch.nn import functional as F + +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# simple MNIST in PyTorch +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +torch_device = {"cpu": "cpu", "gpu": "cuda"} + + +def get_batch() -> torch.Tensor: + return torch.rand(20, 1, 28, 28) + + +def create_torch_model(): + n = Net() + example_forward_input = get_batch() + module = torch.jit.trace(n, example_forward_input) + model_buffer = io.BytesIO() + torch.jit.save(module, model_buffer) + return model_buffer.getvalue() + + +def get_request() -> InferenceRequest: + + tensors = [get_batch() for _ in range(2)] + serialized_tensors = [ + MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key="model", + callback=None, + raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + input_keys=None, + input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + output_keys=None, + raw_model=create_torch_model(), + batch_size=0, + ) + + +sample_request: InferenceRequest = get_request() +worker = TorchWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request, fetch_model_result, mlutils.get_test_device().lower() + ) + 
+ assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + ).shape == torch.Size((20, 10)) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + transform_input_result = worker.transform_input( + sample_request, fetch_input_result, mlutils.get_test_device().lower() + ) + + assert all( + transformed.shape == get_batch().shape + for transformed in transform_input_result.transformed + ) + + +def test_execute(mlutils) -> None: + load_model_result = LoadModelResult( + Net().to(torch_device[mlutils.get_test_device().lower()]) + ) + transform_result = TransformInputResult( + [ + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + for _ in range(2) + ] + ) + + execute_result = worker.execute(sample_request, load_model_result, transform_result) + + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) + + +def test_transform_output(mlutils): + execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + + transformed_output = worker.transform_output( + sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + ) + + assert transformed_output.outputs == execute_result.predictions + assert transformed_output.shape == None + assert transformed_output.order == "c" + assert transformed_output.dtype == "float32" diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 9e9b73c4fa..7b345f9ef1 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -29,11 +29,10 @@ import multiprocessing as mp import pathlib import time -import typing as t import pytest -import torch +torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.workermanager import ( diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index a510f660a5..f284f38d99 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -103,6 +103,16 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": "dragon.infrastructure.connection", MagicMock(), ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.process_desc", + MagicMock(), + ) + monkeypatch.setitem( + sys.modules, + "dragon.data.ddict.ddict", + MagicMock(), + ) monkeypatch.setitem( sys.modules, "dragon.infrastructure.policy", From 5fac3e2334361110095dcadb8d796ef403124b36 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:19:14 -0400 Subject: [PATCH 07/60] Add ability to specify hardware policies on dragon run requests (#631) Adds the ability to specify hardware affinities for cpu/gpu devices. Creates a dragon policy that uses provided policy to modify the resulting dragon `ProcessGroup`. 
[ committed by @ankona ] [ approved by @mellis13 @al-rigazzi ] --- doc/changelog.md | 1 + doc/dragon.rst | 28 ++ .../lattice/online_analysis.ipynb | 6 + .../_core/launcher/dragon/dragonBackend.py | 85 +++- .../_core/launcher/dragon/dragonLauncher.py | 6 + smartsim/_core/launcher/step/dragonStep.py | 10 +- smartsim/_core/launcher/step/step.py | 3 +- smartsim/_core/schemas/dragonRequests.py | 41 +- smartsim/settings/dragonRunSettings.py | 32 ++ tests/test_dragon_client.py | 192 +++++++++ tests/test_dragon_launcher.py | 223 +++++++++- tests/test_dragon_run_policy.py | 371 +++++++++++++++++ ..._backend.py => test_dragon_run_request.py} | 256 +++++++++++- tests/test_dragon_run_request_nowlm.py | 105 +++++ tests/test_dragon_runsettings.py | 98 +++++ tests/test_dragon_step.py | 394 ++++++++++++++++++ 16 files changed, 1826 insertions(+), 25 deletions(-) create mode 100644 tests/test_dragon_client.py create mode 100644 tests/test_dragon_run_policy.py rename tests/{test_dragon_backend.py => test_dragon_run_request.py} (64%) create mode 100644 tests/test_dragon_run_request_nowlm.py create mode 100644 tests/test_dragon_runsettings.py create mode 100644 tests/test_dragon_step.py diff --git a/doc/changelog.md b/doc/changelog.md index ee41fabf88..820b76f0fd 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add hardware pinning capability when using dragon - Add TorchWorker first implementation and mock inference app example - Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included diff --git a/doc/dragon.rst b/doc/dragon.rst index 0bf6a8ea3c..e19b40e4b7 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -65,6 +65,34 @@ In the next sections, we detail how Dragon is integrated into SmartSim. For more information on HPC launchers, visit the :ref:`Run Settings` page. +Hardware Pinning +================ + +Dragon also enables users to specify hardware constraints using ``DragonRunSettings``. CPU +and GPU affinity can be specified using the ``DragonRunSettings`` object. The following +example demonstrates how to specify CPU affinity and GPU affinities simultaneously. Note +that affinities are passed as a list of device indices. + +.. code-block:: python + + # Because "dragon" was specified as the launcher during Experiment initialization, + # create_run_settings will return a DragonRunSettings object + rs = exp.create_run_settings(exe="mpi_app", + exe_args=["--option", "value"], + env_vars={"MYVAR": "VALUE"}) + + # Request the first 8 CPUs for this job + rs.set_cpu_affinity(list(range(9))) + + # Request the first two GPUs on the node for this job + rs.set_gpu_affinity([0, 1]) + +.. note:: + + SmartSim launches jobs in the order they are received on the first available + host in a round-robin pattern. To ensure a process is launched on a node with + specific features, configure a hostname constraint. 
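A brief illustration of how these settings reach the backend: the affinity lists stored by ``set_cpu_affinity``/``set_gpu_affinity`` travel as comma-separated run arguments and are rebuilt into a ``DragonRunPolicy`` before the Dragon ``Policy`` is created. A minimal sketch of that round trip, with illustrative values:

.. code-block:: python

    from smartsim._core.schemas.dragonRequests import DragonRunPolicy

    # set_cpu_affinity/set_gpu_affinity store comma-separated strings in run_args
    run_args = {"cpu-affinity": "0,1,2,3", "gpu-affinity": "0,1"}

    policy = DragonRunPolicy.from_run_args(run_args)
    assert policy.cpu_affinity == [0, 1, 2, 3]
    assert policy.gpu_affinity == [0, 1]
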
+ ================= The Dragon Server ================= diff --git a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb index 412b63dd01..c5f58fa97b 100644 --- a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -378,6 +378,7 @@ }, { "cell_type": "code", + "id": "6f3ed63d-e324-443d-9b68-b2cf618d31c7", "execution_count": 7, "metadata": {}, "outputs": [ @@ -399,6 +400,7 @@ }, { "cell_type": "markdown", + "id": "96c154fe-5ca8-4d89-91f8-8fd4e75cb80e", "metadata": {}, "source": [ "We then apply the function `probe_points` to the `ux` and `uy` tensors computed in the last time step of the previous simulation. Note that all tensors are already on the DB, thus we can reference them by name. Finally, we download and plot the output (a 2D velocity field), which is stored as `probe_u` on the DB." @@ -406,6 +408,7 @@ }, { "cell_type": "code", + "id": "36e3b415-dcc1-4d25-9cce-52388146a4bb", "execution_count": 8, "metadata": {}, "outputs": [ @@ -432,6 +435,7 @@ }, { "cell_type": "markdown", + "id": "9d7e4966-a0de-480c-9556-936197a5a5d2", "metadata": {}, "source": [ "### Uploading a function inline\n", @@ -453,6 +457,7 @@ }, { "cell_type": "markdown", + "id": "1c4daf43-34d0-482a-b9b5-b3b6f1e173c4", "metadata": {}, "source": [ "We then store the function on the DB under the key `norm_function`." @@ -470,6 +475,7 @@ }, { "cell_type": "markdown", + "id": "19409ac6-e118-44db-a847-2d905fdf0331", "metadata": {}, "source": [ "Note that the key we used identifies a functional unit containing the function itself: this is similar to the key used to store the `probe` script above. When we want to run the function, we just call it with `run_script`, by indicating the `script` key as `\"norm_function\"` and the name of the function itself as `\"compute_norm\"`." diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index dcc5c8392b..2938746361 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -214,9 +214,12 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( - dragon_machine.Node(node).hostname - for node in dragon_machine.System().nodes + node for node in dragon_machine.System().nodes ) + self._nodes = [dragon_machine.Node(node) for node in self._hosts] + self._cpus = [node.num_cpus for node in self._nodes] + self._gpus = [node.num_gpus for node in self._nodes] + """List of hosts available in allocation""" self._free_hosts: t.Deque[str] = collections.deque(self._hosts) """List of hosts on which steps can be launched""" @@ -288,6 +291,34 @@ def current_time(self) -> float: """Current time for DragonBackend object, in seconds since the Epoch""" return time.time() + def _can_honor_policy( + self, request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the policy can be honored with resources available + in the allocation. 
+ :param request: DragonRunRequest containing policy information + :returns: Tuple indicating if the policy can be honored and + an optional error message""" + # ensure the policy can be honored + if request.policy: + if request.policy.cpu_affinity: + # make sure some node has enough CPUs + available = max(self._cpus) + requested = max(request.policy.cpu_affinity) + + if requested >= available: + return False, "Cannot satisfy request, not enough CPUs available" + + if request.policy.gpu_affinity: + # make sure some node has enough GPUs + available = max(self._gpus) + requested = max(request.policy.gpu_affinity) + + if requested >= available: + return False, "Cannot satisfy request, not enough GPUs available" + + return True, None + def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: """Check if request can be honored with resources available in the allocation. @@ -302,6 +333,11 @@ def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str] if self._shutdown_requested: message = "Cannot satisfy request, server is shutting down." return False, message + + honorable, err = self._can_honor_policy(request) + if not honorable: + return False, err + return True, None def _allocate_step( @@ -410,6 +446,46 @@ def infra_ddict(self) -> str: return str(self._infra_ddict.serialize()) + @staticmethod + def create_run_policy( + request: DragonRequest, node_name: str + ) -> "dragon_policy.Policy": + """Create a dragon Policy from the request and node name + :param request: DragonRunRequest containing policy information + :param node_name: Name of the node on which the process will run + :returns: dragon_policy.Policy object mapped from request properties""" + if isinstance(request, DragonRunRequest): + run_request: DragonRunRequest = request + + affinity = dragon_policy.Policy.Affinity.DEFAULT + cpu_affinity: t.List[int] = [] + gpu_affinity: t.List[int] = [] + + # Customize policy only if the client requested it, otherwise use default + if run_request.policy is not None: + # Affinities are not mutually exclusive. 
If specified, both are used + if run_request.policy.cpu_affinity: + affinity = dragon_policy.Policy.Affinity.SPECIFIC + cpu_affinity = run_request.policy.cpu_affinity + + if run_request.policy.gpu_affinity: + affinity = dragon_policy.Policy.Affinity.SPECIFIC + gpu_affinity = run_request.policy.gpu_affinity + + if affinity != dragon_policy.Policy.Affinity.DEFAULT: + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + affinity=affinity, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + ) + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -432,10 +508,7 @@ def _start_steps(self) -> None: policies = [] for node_name in hosts: - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - ) + local_policy = self.create_run_policy(request, node_name) policies.extend([local_policy] * request.tasks_per_node) tmp_proc = dragon_process.ProcessTemplate( target=request.exe, diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 17b47e3090..9078fed54f 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -29,6 +29,8 @@ import os import typing as t +from smartsim._core.schemas.dragonRequests import DragonRunPolicy + from ...._core.launcher.stepMapping import StepMap from ....error import LauncherError, SmartSimError from ....log import get_logger @@ -168,6 +170,9 @@ def run(self, step: Step) -> t.Optional[str]: merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + + policy = DragonRunPolicy.from_run_args(run_args) + response = _assert_schema_type( self._connector.send_request( DragonRunRequest( @@ -181,6 +186,7 @@ def run(self, step: Step) -> t.Optional[str]: current_env=merged_env, output_file=out, error_file=err, + policy=policy, ) ), DragonRunResponse, diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 036a9e5654..dd93d7910c 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -30,7 +30,11 @@ import sys import typing as t -from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry +from ...._core.schemas.dragonRequests import ( + DragonRunPolicy, + DragonRunRequest, + request_registry, +) from ....error.errors import SSUnsupportedError from ....log import get_logger from ....settings import ( @@ -166,8 +170,11 @@ def _write_request_file(self) -> str: nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + policy = DragonRunPolicy.from_run_args(run_args) + cmd = step.get_launch_cmd() out, err = step.get_output_files() + request = DragonRunRequest( exe=cmd[0], exe_args=cmd[1:], @@ -179,6 +186,7 @@ def _write_request_file(self) -> str: current_env=os.environ, output_file=out, error_file=err, + policy=policy, ) requests.append(request_registry.to_string(request)) with open(request_file, "w", encoding="utf-8") as script_file: diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 2cce6e6107..171254e32a 100644 --- a/smartsim/_core/launcher/step/step.py +++ 
b/smartsim/_core/launcher/step/step.py @@ -26,6 +26,7 @@ from __future__ import annotations +import copy import functools import os.path as osp import pathlib @@ -51,7 +52,7 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None: self.entity_name = name self.cwd = cwd self.managed = False - self.step_settings = step_settings + self.step_settings = copy.deepcopy(step_settings) self.meta: t.Dict[str, str] = {} @property diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 3e384f746a..487ea915a0 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -26,9 +26,10 @@ import typing as t -from pydantic import BaseModel, Field, PositiveInt +from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt, ValidationError import smartsim._core.schemas.utils as _utils +from smartsim.error.errors import SmartSimError # Black and Pylint disagree about where to put the `...` # pylint: disable=multiple-statements @@ -39,6 +40,43 @@ class DragonRequest(BaseModel): ... +class DragonRunPolicy(BaseModel): + """Policy specifying hardware constraints when running a Dragon job""" + + cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + """List of CPU indices to which the job should be pinned""" + gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + """List of GPU indices to which the job should be pinned""" + + @staticmethod + def from_run_args( + run_args: t.Dict[str, t.Union[int, str, float, None]] + ) -> "DragonRunPolicy": + """Create a DragonRunPolicy with hardware constraints passed from + a dictionary of run arguments + :param run_args: Dictionary of run arguments + :returns: DragonRunPolicy instance created from the run arguments""" + gpu_args = "" + if gpu_arg_value := run_args.get("gpu-affinity", None): + gpu_args = str(gpu_arg_value) + + cpu_args = "" + if cpu_arg_value := run_args.get("cpu-affinity", None): + cpu_args = str(cpu_arg_value) + + # run args converted to a string must be split back into a list[int] + gpu_affinity = [int(x.strip()) for x in gpu_args.split(",") if x] + cpu_affinity = [int(x.strip()) for x in cpu_args.split(",") if x] + + try: + return DragonRunPolicy( + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + except ValidationError as ex: + raise SmartSimError("Unable to build DragonRunPolicy") from ex + + class DragonRunRequestView(DragonRequest): exe: t.Annotated[str, Field(min_length=1)] exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = [] @@ -57,6 +95,7 @@ class DragonRunRequestView(DragonRequest): @request_registry.register("run") class DragonRunRequest(DragonRunRequestView): current_env: t.Dict[str, t.Optional[str]] = {} + policy: t.Optional[DragonRunPolicy] = None def __str__(self) -> str: return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"}))) diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index b8baa4708c..69a91547e7 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -28,6 +28,8 @@ import typing as t +from typing_extensions import override + from ..log import get_logger from .base import RunSettings @@ -63,6 +65,7 @@ def __init__( **kwargs, ) + @override def set_nodes(self, nodes: int) -> None: """Set the number of nodes @@ -70,9 +73,38 @@ def set_nodes(self, nodes: int) -> None: """ self.run_args["nodes"] = nodes + @override def set_tasks_per_node(self, tasks_per_node: int) -> 
None: """Set the number of tasks for this job :param tasks_per_node: number of tasks per node """ self.run_args["tasks-per-node"] = tasks_per_node + + @override + def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + """Specify the node feature for this job + + :param feature_list: a collection of strings representing the required + node features. Currently supported node features are: "gpu" + """ + if isinstance(feature_list, str): + feature_list = feature_list.strip().split() + elif not all(isinstance(feature, str) for feature in feature_list): + raise TypeError("feature_list must be string or list of strings") + + self.run_args["node-feature"] = ",".join(feature_list) + + def set_cpu_affinity(self, devices: t.List[int]) -> None: + """Set the CPU affinity for this job + + :param devices: list of CPU indices to execute on + """ + self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices) + + def set_gpu_affinity(self, devices: t.List[int]) -> None: + """Set the GPU affinity for this job + + :param devices: list of GPU indices to execute on. + """ + self.run_args["gpu-affinity"] = ",".join(str(device) for device in devices) diff --git a/tests/test_dragon_client.py b/tests/test_dragon_client.py new file mode 100644 index 0000000000..80257b6107 --- /dev/null +++ b/tests/test_dragon_client.py @@ -0,0 +1,192 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os +import pathlib +import typing as t +from unittest.mock import MagicMock + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +import smartsim._core.entrypoints.dragon_client as dragon_client +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> "DragonBatchStep": + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + + +def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: + """Helper method for finding the path to a request file from the launch command""" + script_path = pathlib.Path(launch_cmd[-1]) + batch_script = script_path.read_text(encoding="utf-8") + batch_statements = [line for line in batch_script.split("\n") if line] + entrypoint_cmd = batch_statements[-1] + requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) + return requests_file + + +def test_dragon_client_main_no_arg(monkeypatch: pytest.MonkeyPatch): + """Verify the client fails when the path to a submission file is not provided.""" + with pytest.raises(SystemExit): + dragon_client.cleanup = MagicMock() + dragon_client.main([]) + + # arg parser failures occur before resource allocation and should + # not result in resource cleanup being called + assert not dragon_client.cleanup.called + + +def test_dragon_client_main_empty_arg(test_dir: str): + """Verify the client fails when the path to a submission file is empty.""" + + with pytest.raises(ValueError) as ex: + dragon_client.cleanup = MagicMock() + dragon_client.main(["+submit", ""]) + + # verify it's a value error related to submit argument + assert "file not provided" in ex.value.args[0] + + # arg parser failures occur before resource allocation and should + # not result in resource cleanup being called + assert not dragon_client.cleanup.called + + +def test_dragon_client_main_bad_arg(test_dir: str): + """Verify the client returns a failure code when the path to a submission file is + invalid and does not raise an exception""" + path = pathlib.Path(test_dir) / "nonexistent_file.json" + + dragon_client.cleanup = MagicMock() + return_code = dragon_client.main(["+submit", str(path)]) + + # ensure non-zero return code + assert return_code != 0 + + # ensure failures do not block resource cleanup + assert dragon_client.cleanup.called + + +def test_dragon_client_main( + dragon_batch_step: DragonBatchStep, monkeypatch: pytest.MonkeyPatch +): + """Verify the client returns a failure code when the path to a submission file is + invalid and does not raise an exception""" + launch_cmd = dragon_batch_step.get_launch_cmd() + path = get_request_path_from_batch_script(launch_cmd) + num_requests_in_batch = 4 + num_shutdown_requests = 1 + request_count = num_requests_in_batch + num_shutdown_requests + submit_value = str(path) + + mock_connector = MagicMock() # DragonConnector + mock_connector.is_connected = True + mock_connector.send_request.return_value = DragonRunResponse(step_id="mock_step_id") + # mock can_monitor to exit before the infinite loop checking for shutdown + mock_connector.can_monitor = False + + mock_connector_class = MagicMock() + mock_connector_class.return_value = mock_connector + + # with monkeypatch.context() as ctx: + dragon_client.DragonConnector = mock_connector_class + dragon_client.cleanup = MagicMock() + + return_code = dragon_client.main(["+submit", submit_value]) + + # verify each request in the request file was processed + assert mock_connector.send_request.call_count == request_count + + # we know the batch fixture has a step with no affinity args supplied. 
skip it + for i in range(1, num_requests_in_batch): + sent_args = mock_connector.send_request.call_args_list[i][0] + request_arg = sent_args[0] + + assert isinstance(request_arg, DragonRunRequest) + + policy = request_arg.policy + + # make sure each policy has been read in correctly with valid affinity indices + assert len(policy.cpu_affinity) == len(set(policy.cpu_affinity)) + assert len(policy.gpu_affinity) == len(set(policy.gpu_affinity)) + + # we get a non-zero due to avoiding the infinite loop. consider refactoring + assert return_code == os.EX_IOERR + + # ensure failures do not block resource cleanup + assert dragon_client.cleanup.called diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index ee0fcb14b7..4fe8bf71b4 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -31,6 +31,7 @@ import sys import time import typing as t +from unittest.mock import MagicMock import pytest import zmq @@ -38,15 +39,74 @@ import smartsim._core.config from smartsim._core._cli.scripts.dragon_install import create_dotenv from smartsim._core.config.config import get_config -from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import ( + DragonConnector, + DragonLauncher, +) from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, ) +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest -from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse +from smartsim._core.schemas.dragonResponses import ( + DragonHandshakeResponse, + DragonRunResponse, +) from smartsim._core.utils.network import IFConfig, find_free_port from smartsim._core.utils.security import KeyManager +from smartsim.error.errors import LauncherError +from smartsim.settings.dragonRunSettings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> DragonBatchStep: + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... 
+        step.meta["status_dir"] = status_dir
+        # ... and put all the steps into the batch
+        batch_step.add_to_batch(steps[index])
+
+    return batch_step
+
 
 # The tests in this file belong to the group_a group
 pytestmark = pytest.mark.group_a
 
@@ -521,3 +581,162 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str):
 
     # any non-dragon keys that didn't exist avoid unnecessary prepending
     assert merged_env[non_dragon_key] == non_dragon_value
+
+
+def test_run_step_fail(test_dir: str) -> None:
+    """Verify that the dragon launcher still returns the step id
+    when the running step fails"""
+    test_path = pathlib.Path(test_dir)
+    status_dir = (test_path / ".smartsim" / "logs").as_posix()
+
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+    step0 = DragonStep("step0", test_dir, rs)
+    step0.meta["status_dir"] = status_dir
+
+    mock_connector = MagicMock()  # DragonConnector()
+    mock_connector.is_connected = True
+    mock_connector.send_request = MagicMock(
+        return_value=DragonRunResponse(step_id=step0.name, error_message="mock fail!")
+    )
+
+    launcher = DragonLauncher()
+    launcher._connector = mock_connector
+
+    result = launcher.run(step0)
+
+    # verify the failed step name is in the result
+    assert step0.name in result
+
+
+def test_run_step_batch_empty(dragon_batch_step: DragonBatchStep) -> None:
+    """Verify that the dragon launcher behaves correctly when asked to execute
+    a batch step that has no sub-steps"""
+    # remove the steps added in the batch fixture
+    dragon_batch_step.steps.clear()
+
+    mock_step_id = "MOCK-STEPID"
+    mock_connector = MagicMock()  # DragonConnector()
+    mock_connector.is_connected = True
+    mock_connector.send_request = MagicMock(
+        return_value=DragonRunResponse(
+            step_id=dragon_batch_step.name, error_message="mock fail!"
+        )
+    )
+
+    launcher = DragonLauncher()
+    launcher._connector = mock_connector
+    launcher.task_manager.start_and_wait = MagicMock(return_value=(0, mock_step_id, ""))
+
+    result = launcher.run(dragon_batch_step)
+
+    # verify a step name is returned
+    assert result
+    # verify the batch step name is not in the result (renamed to SLURM-*)
+    assert dragon_batch_step.name not in result
+
+    send_invocation = mock_connector.send_request
+
+    # verify a batch request is not sent through the dragon connector
+    send_invocation.assert_not_called()
+
+
+def test_run_step_batch_failure(dragon_batch_step: DragonBatchStep) -> None:
+    """Verify that the dragon launcher raises a LauncherError when the batch
+    script exits with a non-zero return code"""
+    mock_connector = MagicMock()  # DragonConnector()
+    mock_connector.is_connected = True
+    mock_connector.send_request = MagicMock(
+        return_value=DragonRunResponse(
+            step_id=dragon_batch_step.name, error_message="mock fail!"
+        )
+    )
+
+    mock_step_id = "MOCK-STEPID"
+    error_msg = "DOES_NOT_COMPUTE!"
+ launcher = DragonLauncher() + launcher._connector = mock_connector + launcher.task_manager.start_and_wait = MagicMock( + return_value=(1, mock_step_id, error_msg) + ) + + # a non-zero return code from the batch script should raise an error + with pytest.raises(LauncherError) as ex: + launcher.run(dragon_batch_step) + + # verify the correct error message is in the exception + assert error_msg in ex.value.args[0] + + +def test_run_step_success(test_dir: str) -> None: + """Verify that the dragon launcher sends the correctly formatted request for a step""" + test_path = pathlib.Path(test_dir) + status_dir = (test_path / ".smartsim" / "logs").as_posix() + + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + step0 = DragonStep("step0", test_dir, rs) + step0.meta["status_dir"] = status_dir + + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse(step_id=step0.name) + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + + result = launcher.run(step0) + + # verify the successfully executed step name is in the result + assert step0.name in result + + # verify the DragonRunRequest sent matches all expectations + send_invocation = mock_connector.send_request + send_invocation.assert_called_once() + + args = send_invocation.call_args[0] # call_args == t.Tuple[args, kwargs] + + dragon_run_request = args[0] + req_name = dragon_run_request.name # name sent to dragon env + assert req_name.startswith(step0.name) + + req_policy_cpu_affinity = dragon_run_request.policy.cpu_affinity + assert not req_policy_cpu_affinity # default should be empty list + + req_policy_gpu_affinity = dragon_run_request.policy.gpu_affinity + assert not req_policy_gpu_affinity # default should be empty list + + +def test_run_step_success_batch( + monkeypatch: pytest.MonkeyPatch, dragon_batch_step: DragonBatchStep +) -> None: + """Verify that the dragon launcher sends the correctly formatted request + for a batch step""" + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse(step_id=dragon_batch_step.name) + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + launcher.task_manager.start_and_wait = MagicMock(return_value=(0, "success", "")) + + result = launcher.run(dragon_batch_step) + + # verify the successfully executed step name is in the result + assert dragon_batch_step.name not in result + assert result + + send_invocation = mock_connector.send_request + + # verify a batch request is not sent through the dragon connector + send_invocation.assert_not_called() + launcher.task_manager.start_and_wait.assert_called_once() + + args = launcher.task_manager.start_and_wait.call_args[0] + + # verify the batch script is executed + launch_cmd = dragon_batch_step.get_launch_cmd() + for stmt in launch_cmd: + assert stmt in args[0] # args[0] is the cmd list sent to subprocess.Popen diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py new file mode 100644 index 0000000000..1d8d069fab --- /dev/null +++ b/tests/test_dragon_run_policy.py @@ -0,0 +1,371 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings.dragonRunSettings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + +try: + from dragon.infrastructure.policy import Policy + + import smartsim._core.entrypoints.dragon as drg + from smartsim._core.launcher.dragon.dragonBackend import DragonBackend + + dragon_loaded = True +except: + dragon_loaded = False + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> "DragonBatchStep": + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch
+        batch_step.add_to_batch(steps[index])
+
+    return batch_step
+
+
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
+@pytest.mark.parametrize(
+    "dragon_request",
+    [
+        pytest.param(DragonHandshakeRequest(), id="DragonHandshakeRequest"),
+        pytest.param(DragonShutdownRequest(), id="DragonShutdownRequest"),
+        pytest.param(
+            DragonBootstrapRequest(address="localhost"), id="DragonBootstrapRequest"
+        ),
+    ],
+)
+def test_create_run_policy_non_run_request(dragon_request: DragonRequest) -> None:
+    """Verify that a default policy is returned when a request is
+    not attempting to start a new process (i.e. is not a DragonRunRequest)"""
+    policy = DragonBackend.create_run_policy(dragon_request, "localhost")
+
+    assert policy is not None, "Default policy was not returned"
+    assert (
+        policy.device == Policy.Device.DEFAULT
+    ), "Default device was not Device.DEFAULT"
+    assert policy.cpu_affinity == [], "Default cpu affinity was not empty"
+    assert policy.gpu_affinity == [], "Default gpu affinity was not empty"
+
+
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
+def test_create_run_policy_run_request_no_run_policy() -> None:
+    """Verify that a run request without a policy produces a Policy with all
+    default values (default device, empty cpu & gpu affinity)"""
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=2,
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        # policy=  # <--- skipping this
+    )
+
+    policy = DragonBackend.create_run_policy(run_req, "localhost")
+
+    assert policy.device == Policy.Device.DEFAULT
+    assert set(policy.cpu_affinity) == set()
+    assert policy.gpu_affinity == []
+    assert policy.affinity == Policy.Affinity.DEFAULT
+
+
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
+def test_create_run_policy_run_request_default_run_policy() -> None:
+    """Verify that a run request with a default policy (no affinity specified)
+    produces a Policy with the default device and empty affinity lists"""
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=2,
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),  # <--- passing default values
+    )
+
+    policy = DragonBackend.create_run_policy(run_req, "localhost")
+
+    assert set(policy.cpu_affinity) == set()
+    assert set(policy.gpu_affinity) == set()
+    assert policy.affinity == Policy.Affinity.DEFAULT
+
+
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
+def test_create_run_policy_run_request_cpu_affinity_no_device() -> None:
+    """Verify that an input policy specifying a CPU affinity but lacking the device
+    field produces a Dragon Policy with the requested CPU affinity applied"""
+    affinity = set([0, 2, 4])
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=2,
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(cpu_affinity=list(affinity)),  # <-- no device spec
+    )
+
+    policy = DragonBackend.create_run_policy(run_req, "localhost")
+
+    assert set(policy.cpu_affinity) == affinity
+    assert policy.gpu_affinity == []
+    assert policy.affinity == Policy.Affinity.SPECIFIC
+
+
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
+def test_create_run_policy_run_request_cpu_affinity() -> None:
+    """Verify that a policy specifying CPU affinity
is returned as expected""" + affinity = set([0, 2, 4]) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=list(affinity)), + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert set(policy.cpu_affinity) == affinity + assert policy.gpu_affinity == [] + assert policy.affinity == Policy.Affinity.SPECIFIC + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_gpu_affinity() -> None: + """Verify that a policy specifying GPU affinity is returned as expected""" + affinity = set([0, 2, 4]) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(device="gpu", gpu_affinity=list(affinity)), + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert policy.cpu_affinity == [] + assert set(policy.gpu_affinity) == set(affinity) + assert policy.affinity == Policy.Affinity.SPECIFIC + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_dragon_run_policy_from_run_args() -> None: + """Verify that a DragonRunPolicy is created from a dictionary of run arguments""" + run_args = { + "gpu-affinity": "0,1,2", + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [0, 1, 2] + + +def test_dragon_run_policy_from_run_args_empty() -> None: + """Verify that a DragonRunPolicy is created from an empty + dictionary of run arguments""" + run_args = {} + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments containing a CPU affinity""" + run_args = { + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments containing a GPU affinity""" + run_args = { + "gpu-affinity": "0, 1, 2", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 1, 2] + + +def test_dragon_run_policy_from_run_args_invalid_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is NOT created from a dictionary + of run arguments with an invalid GPU affinity""" + run_args = { + "gpu-affinity": "0,-1,2", + } + + with pytest.raises(SmartSimError) as ex: + DragonRunPolicy.from_run_args(run_args) + + assert "DragonRunPolicy" in ex.value.args[0] + + +def test_dragon_run_policy_from_run_args_invalid_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is NOT created from a dictionary + of run arguments with an invalid CPU affinity""" + run_args = { + "cpu-affinity": "3,4,5,-6", + } + + with pytest.raises(SmartSimError) as ex: + DragonRunPolicy.from_run_args(run_args) + + assert "DragonRunPolicy" in ex.value.args[0] + + +def test_dragon_run_policy_from_run_args_ignore_empties_gpu() -> None: + """Verify that a DragonRunPolicy is 
created from a dictionary + of run arguments and ignores empty values in the serialized gpu list""" + run_args = { + "gpu-affinity": "0,,2", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 2] + + +def test_dragon_run_policy_from_run_args_ignore_empties_cpu() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments and ignores empty values in the serialized cpu list""" + run_args = { + "cpu-affinity": "3,4,,6,", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_null_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is created if a null value is encountered + in the gpu-affinity list""" + run_args = { + "gpu-affinity": None, + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_null_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is created if a null value is encountered + in the cpu-affinity list""" + run_args = {"gpu-affinity": "0,1,2", "cpu-affinity": None} + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 1, 2] diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_run_request.py similarity index 64% rename from tests/test_dragon_backend.py rename to tests/test_dragon_run_request.py index f284f38d99..94c17c222a 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_run_request.py @@ -31,19 +31,17 @@ from unittest.mock import MagicMock import pytest +from pydantic import ValidationError # The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_a +pytestmark = pytest.mark.group_b try: import dragon -except ImportError: - pass -else: - pytest.skip( - reason="Using dragon as launcher, not running Dragon unit tests", - allow_module_level=True, - ) + + dragon_loaded = True +except: + dragon_loaded = False from smartsim._core.config import CONFIG from smartsim._core.schemas.dragonRequests import * @@ -59,10 +57,36 @@ class NodeMock(MagicMock): + def __init__( + self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 + ) -> None: + super().__init__() + self._mock_id = name + NodeMock._num_gpus = num_gpus + NodeMock._num_cpus = num_cpus + @property def hostname(self) -> str: + if self._mock_id: + return self._mock_id return create_short_id_str() + @property + def num_cpus(self) -> str: + return NodeMock._num_cpus + + @property + def num_gpus(self) -> str: + return NodeMock._num_gpus + + def _set_id(self, value: str) -> None: + self._mock_id = value + + def gpus(self, parent: t.Any = None) -> t.List[str]: + if self._num_gpus: + return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)] + return [] + class GroupStateMock(MagicMock): def Running(self) -> MagicMock: @@ -78,13 +102,19 @@ class ProcessGroupMock(MagicMock): puids = [121, 122] -def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": +def node_mock() -> NodeMock: + return NodeMock() + + +def get_mock_backend( + monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2 +) -> "DragonBackend": process_mock = MagicMock(returncode=0) process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) process_module_mock = MagicMock() 
process_module_mock.Process = process_mock - node_mock = NodeMock() + node_mock = NodeMock(num_gpus=num_gpus) system_mock = MagicMock(nodes=["node1", "node2", "node3"]) monkeypatch.setitem( sys.modules, @@ -199,6 +229,7 @@ def set_mock_group_infos( return group_infos +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -209,6 +240,7 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: assert handshake_resp.dragon_pid == 99999 +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) run_req = DragonRunRequest( @@ -259,6 +291,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -284,6 +317,78 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.group_infos[step_id].status == SmartSimStatus.STATUS_FAILED +def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a policy is applied to a run request""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=None, + ) + assert run_req.policy is None + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a policy is applied to a run request""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=[0, 1]), + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + + step_id = run_resp.step_id + assert dragon_backend._queued_steps[step_id] == run_req + + mock_process_group = MagicMock(puids=[123, 124]) + + dragon_backend._group_infos[step_id].process_group = mock_process_group + dragon_backend._group_infos[step_id].puids = [123, 124] + dragon_backend._start_steps() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + monkeypatch.setattr( + dragon_backend._group_infos[step_id].process_group, "status", "Running" + ) + + dragon_backend._update() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + + dragon_backend._update() + assert not dragon_backend._running_steps + + 
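+# NOTE: the run-request policy tests above walk the backend scheduling
+# lifecycle: process_request queues the step, _start_steps allocates hosts and
+# starts it, and _update() drops it from the running list once it is cancelled.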
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -300,6 +405,7 @@ def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: } +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) @@ -331,6 +437,7 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: assert len(dragon_backend._free_hosts) == 3 +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize( "immediate, kill_jobs, frontend_shutdown", [ @@ -389,6 +496,7 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("telemetry_flag", ["0", "1"]) def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) @@ -404,6 +512,7 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) - assert dragon_backend.cooldown_period == expected_cooldown +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) first_heartbeat = dragon_backend.last_heartbeat @@ -412,6 +521,7 @@ def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.last_heartbeat > first_heartbeat +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("num_nodes", [1, 3, 100]) def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -432,6 +542,119 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: ) +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +@pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) +def test_can_honor_cpu_affinity( + monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] +) -> None: + """Verify that valid CPU affinities are accepted""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=affinity), + ) + + assert dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that invalid CPU affinities are NOT accepted + NOTE: negative values are captured by the Pydantic schema""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=list(range(9))), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") 
+@pytest.mark.parametrize("affinity", [[0], [0, 1]]) +def test_can_honor_gpu_affinity( + monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] +) -> None: + """Verify that valid GPU affinities are accepted""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(gpu_affinity=affinity), + ) + + assert dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that invalid GPU affinities are NOT accepted + NOTE: negative values are captured by the Pydantic schema""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(gpu_affinity=list(range(3))), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a request for a GPU if none exists is not accepted""" + + # create a mock node class that always reports no GPUs available + dragon_backend = get_mock_backend(monkeypatch, num_gpus=0) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + # specify GPU device w/no affinity + policy=DragonRunPolicy(gpu_affinity=[0]), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) step_id = next(dragon_backend._step_ids) @@ -440,6 +663,7 @@ def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: assert step_id != next(dragon_backend._step_ids) +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_view(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) set_mock_group_infos(monkeypatch, dragon_backend) @@ -447,17 +671,21 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: expected_message = textwrap.dedent(f"""\ Dragon server backend update - | Host | Status | - |---------|----------| + | Host | Status | + |--------|----------| | {hosts[0]} | Busy | | {hosts[1]} | Free | | {hosts[2]} | Free | | Step | Status | Hosts | Return codes | Num procs | - |----------|--------------|-----------------|----------------|-------------| + |----------|--------------|-------------|----------------|-------------| | abc123-1 | Running | {hosts[0]} | | 1 | | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | | ljace0-5 | NeverStarted | | | 0 |""") - assert dragon_backend.status_message == expected_message + # get rid of white space to make the comparison easier + actual_msg = dragon_backend.status_message.replace(" ", "") + expected_message = expected_message.replace(" ", "") + + assert actual_msg == expected_message diff --git a/tests/test_dragon_run_request_nowlm.py 
b/tests/test_dragon_run_request_nowlm.py new file mode 100644 index 0000000000..afd25aa9d7 --- /dev/null +++ b/tests/test_dragon_run_request_nowlm.py @@ -0,0 +1,105 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +from pydantic import ValidationError + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +def test_run_request_with_null_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that an empty policy does not cause an error""" + # dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=None, + ) + assert run_req.policy is None + + +def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a non-empty policy is set correctly""" + # dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), + ) + assert run_req.policy is not None + assert not run_req.policy.cpu_affinity + assert not run_req.policy.gpu_affinity + + +@pytest.mark.parametrize( + "device,cpu_affinity,gpu_affinity", + [ + pytest.param("cpu", [-1], [], id="cpu_affinity"), + pytest.param("gpu", [], [-1], id="gpu_affinity"), + ], +) +def test_run_request_with_negative_affinity( + device: str, + cpu_affinity: t.List[int], + gpu_affinity: t.List[int], +) -> None: + """Verify that invalid affinity values fail validation""" + with pytest.raises(ValidationError) as ex: + DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy( + cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity + ), + ) + + assert f"{device}_affinity" in str(ex.value.args[0]) + assert "NumberNotGeError" in 
str(ex.value.args[0])
diff --git a/tests/test_dragon_runsettings.py b/tests/test_dragon_runsettings.py
new file mode 100644
index 0000000000..34e8510e82
--- /dev/null
+++ b/tests/test_dragon_runsettings.py
@@ -0,0 +1,98 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from smartsim.settings import DragonRunSettings
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+def test_dragon_runsettings_nodes():
+    """Verify that node count is set correctly"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    exp_value = 3
+    rs.set_nodes(exp_value)
+    assert rs.run_args["nodes"] == exp_value
+
+    exp_value = 9
+    rs.set_nodes(exp_value)
+    assert rs.run_args["nodes"] == exp_value
+
+
+def test_dragon_runsettings_tasks_per_node():
+    """Verify that tasks per node is set correctly"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    exp_value = 3
+    rs.set_tasks_per_node(exp_value)
+    assert rs.run_args["tasks-per-node"] == exp_value
+
+    exp_value = 7
+    rs.set_tasks_per_node(exp_value)
+    assert rs.run_args["tasks-per-node"] == exp_value
+
+
+def test_dragon_runsettings_cpu_affinity():
+    """Verify that the CPU affinity is set correctly"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    exp_value = [0, 1, 2, 3]
+    rs.set_cpu_affinity([0, 1, 2, 3])
+    assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value)
+
+    # ensure the stored value is not changed when we extend the local list
+    exp_value.extend([4, 5, 6])
+    assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value)
+
+    rs.set_cpu_affinity(exp_value)
+    assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value)
+
+    # ensure overwriting the run arg directly is reflected in the stored value
+    rs.run_args["cpu-affinity"] = "7,8,9"
+    assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value)
+
+
+def test_dragon_runsettings_gpu_affinity():
+    """Verify that the GPU affinity is set correctly"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    exp_value = [0, 1, 2, 3]
+    rs.set_gpu_affinity([0, 1, 2, 3])
+    assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value)
+
+    # ensure the stored value is not changed when we extend the local list
+    exp_value.extend([4, 5, 6])
+    assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value)
+
+    rs.set_gpu_affinity(exp_value)
+    assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value)
+
+    # ensure overwriting the run arg directly is reflected in the stored value
+    rs.run_args["gpu-affinity"] = "7,8,9"
+    assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value)
diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py
new file mode 100644
index 0000000000..19f408e0bd
--- /dev/null
+++ b/tests/test_dragon_step.py
@@ -0,0 +1,394 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
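+"""Unit tests for DragonStep and DragonBatchStep: step creation, launch command
+generation, and batch request-file serialization."""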
+ +import json +import pathlib +import shutil +import sys +import typing as t + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings import DragonRunSettings +from smartsim.settings.pbsSettings import QsubBatchSettings +from smartsim.settings.slurmSettings import SbatchSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> DragonBatchStep: + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + + +def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: + """Helper method for finding the path to a request file from the launch command""" + script_path = pathlib.Path(launch_cmd[-1]) + batch_script = script_path.read_text(encoding="utf-8") + batch_statements = [line for line in batch_script.split("\n") if line] + entrypoint_cmd = batch_statements[-1] + requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) + return requests_file + + +def test_dragon_step_creation(test_dir: str) -> None: + """Verify that the step is created with the values provided""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + # confirm the name has been made unique to avoid conflicts + assert step.name != original_name + assert step.entity_name == original_name + assert step.cwd == test_dir + assert step.step_settings is not None + + +def test_dragon_step_name_uniqueness(test_dir: str) -> None: + """Verify that step name is unique and independent of step content""" + + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + original_name = "test" + + num_steps = 100 + steps = [DragonStep(original_name, test_dir, rs) for _ in range(num_steps)] + + # confirm the name has been made unique in each step + step_names = {step.name for step in steps} + assert len(step_names) == num_steps + + +def test_dragon_step_launch_cmd(test_dir: str) -> None: + """Verify the expected launch cmd is generated w/minimal settings""" + exp_exe = "sleep" + exp_exe_args = "1" + rs = DragonRunSettings(exe=exp_exe, exe_args=[exp_exe_args]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 2 + + # we'll verify the exe_args and exe name are handled correctly + exe, args = launch_cmd + assert exp_exe in exe + assert exp_exe_args in args + + # also, verify that a string exe_args param instead of list is handled correctly + exp_exe_args = "1 2 3" + rs = DragonRunSettings(exe=exp_exe, exe_args=exp_exe_args) + step = DragonStep(original_name, test_dir, rs) + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 4 # "/foo/bar/sleep 1 2 3" + + +def test_dragon_step_launch_cmd_multi_arg(test_dir: str) -> None: + """Verify the expected launch cmd is generated when multiple arguments + are passed to run settings""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + + original_name = "test" + + step = DragonStep(original_name, test_dir, rs) + + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 4 + + exe, *args = launch_cmd + assert exp_exe in exe + assert arg0 in args + assert arg1 in args + assert arg2 in args + + +def test_dragon_step_launch_cmd_no_bash( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that requirement for bash shell is checked""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + rs.colocated_db_settings = {"foo": "bar"} # triggers bash lookup + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + with pytest.raises(RuntimeError) as ex, monkeypatch.context() as ctx: + ctx.setattr(shutil, "which", lambda _: None) + step.get_launch_cmd() + + # verify the exception thrown is the one we're looking for + assert "Could not 
find" in ex.value.args[0] + + +def test_dragon_step_colocated_db() -> None: + # todo: implement a test for the branch where bash is found and + # run_settings.colocated_db_settings is set + ... + + +def test_dragon_step_container() -> None: + # todo: implement a test for the branch where run_settings.container + # is an instance of class `Singularity` + ... + + +def test_dragon_step_run_settings_accessor(test_dir: str) -> None: + """Verify the run settings passed to the step are copied correctly and + are not inadvertently modified outside the step""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + rs_output = step.run_settings + + assert rs.exe == rs_output.exe + assert rs.exe_args == rs_output.exe_args + + # ensure we have a deep copy + rs.exe = "foo" + assert id(step.run_settings) != id(rs) + assert step.run_settings.exe != rs.exe + + +def test_dragon_batch_step_creation(test_dir: str) -> None: + """Verify that the batch step is created with the values provided""" + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # confirm the name has been made unique to avoid conflicts + assert batch_step.name != batch_step_name + assert batch_step.entity_name == batch_step_name + assert batch_step.cwd == test_dir + assert batch_step.batch_settings is not None + assert batch_step.managed + + +def test_dragon_batch_step_add_to_batch(test_dir: str) -> None: + """Verify that steps are added to the batch correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + name0, name1, name2 = "test00", "test01", "test02" + step0 = DragonStep(name0, test_dir, rs) + step1 = DragonStep(name1, test_dir, rs) + step2 = DragonStep(name2, test_dir, rs) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + assert len(batch_step.steps) == 0 + + batch_step.add_to_batch(step0) + assert len(batch_step.steps) == 1 + assert name0 in ",".join({step.name for step in batch_step.steps}) + + batch_step.add_to_batch(step1) + assert len(batch_step.steps) == 2 + assert name1 in ",".join({step.name for step in batch_step.steps}) + + batch_step.add_to_batch(step2) + assert len(batch_step.steps) == 3 + assert name2 in ",".join({step.name for step in batch_step.steps}) + + +def test_dragon_batch_step_get_launch_command_meta_fail(test_dir: str) -> None: + """Verify that the batch launch command cannot be generated without + having the status directory set in the step metadata""" + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + with pytest.raises(KeyError) as ex: + batch_step.get_launch_cmd() + + +@pytest.mark.parametrize( + "batch_settings_class,batch_exe,batch_header,node_spec_tpl", + [ + pytest.param( + SbatchSettings, "sbatch", "#SBATCH", "#SBATCH --nodes={0}", id="sbatch" + ), + pytest.param(QsubBatchSettings, "qsub", "#PBS", "#PBS -l nodes={0}", id="qsub"), + ], +) +def test_dragon_batch_step_get_launch_command( + test_dir: str, + batch_settings_class: t.Type, + batch_exe: str, + batch_header: str, + node_spec_tpl: str, +) -> None: + """Verify that the batch launch command is properly generated and + 
the expected side effects are present (writing script file to disk)""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = batch_settings_class(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + launch_cmd = batch_step.get_launch_cmd() + assert launch_cmd + + full_cmd = " ".join(launch_cmd) + assert batch_exe in full_cmd # verify launcher running the batch + assert test_dir in full_cmd # verify outputs are sent to expected directory + assert "batch_step.sh" in full_cmd # verify batch script name is in the command + + # ...verify that the script file is written when getting the launch command + script_path = pathlib.Path(launch_cmd[-1]) + assert script_path.exists() + assert len(script_path.read_bytes()) > 0 + + batch_script = script_path.read_text(encoding="utf-8") + + # ...verify the script file has the expected batch script header content + assert batch_header in batch_script + assert node_spec_tpl.format(num_nodes) in batch_script # verify node count is set + + # ...verify the script has the expected entrypoint command + batch_statements = [line for line in batch_script.split("\n") if line] + python_path = sys.executable + + entrypoint_cmd = batch_statements[-1] + assert python_path in entrypoint_cmd + assert "smartsim._core.entrypoints.dragon_client +submit" in entrypoint_cmd + + +def test_dragon_batch_step_write_request_file_no_steps(test_dir: str) -> None: + """Verify that the batch launch command writes an appropriate request file + if no steps are attached""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + launch_cmd = batch_step.get_launch_cmd() + requests_file = get_request_path_from_batch_script(launch_cmd) + + # no steps have been added yet, so the requests file should be a serialized, empty list + assert requests_file.read_text(encoding="utf-8") == "[]" + + +def test_dragon_batch_step_write_request_file( + dragon_batch_step: DragonBatchStep, +) -> None: + """Verify that the batch launch command writes an appropriate request file + for the set of attached steps""" + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + launch_cmd = dragon_batch_step.get_launch_cmd() + requests_file = get_request_path_from_batch_script(launch_cmd) + + requests_text = requests_file.read_text(encoding="utf-8") + requests_json: t.List[str] = json.loads(requests_text) + + # verify that there is an item in file for each step added to the batch + assert len(requests_json) == len(dragon_batch_step.steps) + + for index, req in enumerate(requests_json): + req_type, req_data = req.split("|", 1) + # the only steps added are to execute apps, requests should be of type "run" + assert req_type == "run" + + run_request = DragonRunRequest(**json.loads(req_data)) + assert run_request + assert run_request.policy.cpu_affinity == cpu_affinities[index] + assert run_request.policy.gpu_affinity == gpu_affinities[index] From 
0030a4af2edbba211bf8f898456f3f20389f428c Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:47:32 -0400 Subject: [PATCH 08/60] Revert "Add ability to specify hardware policies on dragon run requests" (#637) Reverts CrayLabs/SmartSim#631 --- doc/changelog.md | 1 - doc/dragon.rst | 28 -- .../lattice/online_analysis.ipynb | 6 - .../_core/launcher/dragon/dragonBackend.py | 85 +--- .../_core/launcher/dragon/dragonLauncher.py | 6 - smartsim/_core/launcher/step/dragonStep.py | 10 +- smartsim/_core/launcher/step/step.py | 3 +- smartsim/_core/schemas/dragonRequests.py | 41 +- smartsim/settings/dragonRunSettings.py | 32 -- ..._run_request.py => test_dragon_backend.py} | 256 +----------- tests/test_dragon_client.py | 192 --------- tests/test_dragon_launcher.py | 223 +--------- tests/test_dragon_run_policy.py | 371 ----------------- tests/test_dragon_run_request_nowlm.py | 105 ----- tests/test_dragon_runsettings.py | 98 ----- tests/test_dragon_step.py | 394 ------------------ 16 files changed, 25 insertions(+), 1826 deletions(-) rename tests/{test_dragon_run_request.py => test_dragon_backend.py} (64%) delete mode 100644 tests/test_dragon_client.py delete mode 100644 tests/test_dragon_run_policy.py delete mode 100644 tests/test_dragon_run_request_nowlm.py delete mode 100644 tests/test_dragon_runsettings.py delete mode 100644 tests/test_dragon_step.py diff --git a/doc/changelog.md b/doc/changelog.md index 820b76f0fd..ee41fabf88 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,7 +13,6 @@ Jump to: Description -- Add hardware pinning capability when using dragon - Add TorchWorker first implementation and mock inference app example - Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included diff --git a/doc/dragon.rst b/doc/dragon.rst index e19b40e4b7..0bf6a8ea3c 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -65,34 +65,6 @@ In the next sections, we detail how Dragon is integrated into SmartSim. For more information on HPC launchers, visit the :ref:`Run Settings` page. -Hardware Pinning -================ - -Dragon also enables users to specify hardware constraints using ``DragonRunSettings``. CPU -and GPU affinity can be specified using the ``DragonRunSettings`` object. The following -example demonstrates how to specify CPU affinity and GPU affinities simultaneously. Note -that affinities are passed as a list of device indices. - -.. code-block:: python - - # Because "dragon" was specified as the launcher during Experiment initialization, - # create_run_settings will return a DragonRunSettings object - rs = exp.create_run_settings(exe="mpi_app", - exe_args=["--option", "value"], - env_vars={"MYVAR": "VALUE"}) - - # Request the first 8 CPUs for this job - rs.set_cpu_affinity(list(range(9))) - - # Request the first two GPUs on the node for this job - rs.set_gpu_affinity([0, 1]) - -.. note:: - - SmartSim launches jobs in the order they are received on the first available - host in a round-robin pattern. To ensure a process is launched on a node with - specific features, configure a hostname constraint. 
- ================= The Dragon Server ================= diff --git a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb index c5f58fa97b..412b63dd01 100644 --- a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -378,7 +378,6 @@ }, { "cell_type": "code", - "id": "6f3ed63d-e324-443d-9b68-b2cf618d31c7", "execution_count": 7, "metadata": {}, "outputs": [ @@ -400,7 +399,6 @@ }, { "cell_type": "markdown", - "id": "96c154fe-5ca8-4d89-91f8-8fd4e75cb80e", "metadata": {}, "source": [ "We then apply the function `probe_points` to the `ux` and `uy` tensors computed in the last time step of the previous simulation. Note that all tensors are already on the DB, thus we can reference them by name. Finally, we download and plot the output (a 2D velocity field), which is stored as `probe_u` on the DB." @@ -408,7 +406,6 @@ }, { "cell_type": "code", - "id": "36e3b415-dcc1-4d25-9cce-52388146a4bb", "execution_count": 8, "metadata": {}, "outputs": [ @@ -435,7 +432,6 @@ }, { "cell_type": "markdown", - "id": "9d7e4966-a0de-480c-9556-936197a5a5d2", "metadata": {}, "source": [ "### Uploading a function inline\n", @@ -457,7 +453,6 @@ }, { "cell_type": "markdown", - "id": "1c4daf43-34d0-482a-b9b5-b3b6f1e173c4", "metadata": {}, "source": [ "We then store the function on the DB under the key `norm_function`." @@ -475,7 +470,6 @@ }, { "cell_type": "markdown", - "id": "19409ac6-e118-44db-a847-2d905fdf0331", "metadata": {}, "source": [ "Note that the key we used identifies a functional unit containing the function itself: this is similar to the key used to store the `probe` script above. When we want to run the function, we just call it with `run_script`, by indicating the `script` key as `\"norm_function\"` and the name of the function itself as `\"compute_norm\"`." diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2938746361..dcc5c8392b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -214,12 +214,9 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( - node for node in dragon_machine.System().nodes + dragon_machine.Node(node).hostname + for node in dragon_machine.System().nodes ) - self._nodes = [dragon_machine.Node(node) for node in self._hosts] - self._cpus = [node.num_cpus for node in self._nodes] - self._gpus = [node.num_gpus for node in self._nodes] - """List of hosts available in allocation""" self._free_hosts: t.Deque[str] = collections.deque(self._hosts) """List of hosts on which steps can be launched""" @@ -291,34 +288,6 @@ def current_time(self) -> float: """Current time for DragonBackend object, in seconds since the Epoch""" return time.time() - def _can_honor_policy( - self, request: DragonRunRequest - ) -> t.Tuple[bool, t.Optional[str]]: - """Check if the policy can be honored with resources available - in the allocation. 
- :param request: DragonRunRequest containing policy information - :returns: Tuple indicating if the policy can be honored and - an optional error message""" - # ensure the policy can be honored - if request.policy: - if request.policy.cpu_affinity: - # make sure some node has enough CPUs - available = max(self._cpus) - requested = max(request.policy.cpu_affinity) - - if requested >= available: - return False, "Cannot satisfy request, not enough CPUs available" - - if request.policy.gpu_affinity: - # make sure some node has enough GPUs - available = max(self._gpus) - requested = max(request.policy.gpu_affinity) - - if requested >= available: - return False, "Cannot satisfy request, not enough GPUs available" - - return True, None - def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: """Check if request can be honored with resources available in the allocation. @@ -333,11 +302,6 @@ def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str] if self._shutdown_requested: message = "Cannot satisfy request, server is shutting down." return False, message - - honorable, err = self._can_honor_policy(request) - if not honorable: - return False, err - return True, None def _allocate_step( @@ -446,46 +410,6 @@ def infra_ddict(self) -> str: return str(self._infra_ddict.serialize()) - @staticmethod - def create_run_policy( - request: DragonRequest, node_name: str - ) -> "dragon_policy.Policy": - """Create a dragon Policy from the request and node name - :param request: DragonRunRequest containing policy information - :param node_name: Name of the node on which the process will run - :returns: dragon_policy.Policy object mapped from request properties""" - if isinstance(request, DragonRunRequest): - run_request: DragonRunRequest = request - - affinity = dragon_policy.Policy.Affinity.DEFAULT - cpu_affinity: t.List[int] = [] - gpu_affinity: t.List[int] = [] - - # Customize policy only if the client requested it, otherwise use default - if run_request.policy is not None: - # Affinities are not mutually exclusive. 
If specified, both are used - if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC - cpu_affinity = run_request.policy.cpu_affinity - - if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC - gpu_affinity = run_request.policy.gpu_affinity - - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) - - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - ) - def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -508,7 +432,10 @@ def _start_steps(self) -> None: policies = [] for node_name in hosts: - local_policy = self.create_run_policy(request, node_name) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + ) policies.extend([local_policy] * request.tasks_per_node) tmp_proc = dragon_process.ProcessTemplate( target=request.exe, diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 9078fed54f..17b47e3090 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -29,8 +29,6 @@ import os import typing as t -from smartsim._core.schemas.dragonRequests import DragonRunPolicy - from ...._core.launcher.stepMapping import StepMap from ....error import LauncherError, SmartSimError from ....log import get_logger @@ -170,9 +168,6 @@ def run(self, step: Step) -> t.Optional[str]: merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) - - policy = DragonRunPolicy.from_run_args(run_args) - response = _assert_schema_type( self._connector.send_request( DragonRunRequest( @@ -186,7 +181,6 @@ def run(self, step: Step) -> t.Optional[str]: current_env=merged_env, output_file=out, error_file=err, - policy=policy, ) ), DragonRunResponse, diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index dd93d7910c..036a9e5654 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -30,11 +30,7 @@ import sys import typing as t -from ...._core.schemas.dragonRequests import ( - DragonRunPolicy, - DragonRunRequest, - request_registry, -) +from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry from ....error.errors import SSUnsupportedError from ....log import get_logger from ....settings import ( @@ -170,11 +166,8 @@ def _write_request_file(self) -> str: nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) - policy = DragonRunPolicy.from_run_args(run_args) - cmd = step.get_launch_cmd() out, err = step.get_output_files() - request = DragonRunRequest( exe=cmd[0], exe_args=cmd[1:], @@ -186,7 +179,6 @@ def _write_request_file(self) -> str: current_env=os.environ, output_file=out, error_file=err, - policy=policy, ) requests.append(request_registry.to_string(request)) with open(request_file, "w", encoding="utf-8") as script_file: diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 171254e32a..2cce6e6107 100644 --- a/smartsim/_core/launcher/step/step.py +++ 
b/smartsim/_core/launcher/step/step.py @@ -26,7 +26,6 @@ from __future__ import annotations -import copy import functools import os.path as osp import pathlib @@ -52,7 +51,7 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None: self.entity_name = name self.cwd = cwd self.managed = False - self.step_settings = copy.deepcopy(step_settings) + self.step_settings = step_settings self.meta: t.Dict[str, str] = {} @property diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 487ea915a0..3e384f746a 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -26,10 +26,9 @@ import typing as t -from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt, ValidationError +from pydantic import BaseModel, Field, PositiveInt import smartsim._core.schemas.utils as _utils -from smartsim.error.errors import SmartSimError # Black and Pylint disagree about where to put the `...` # pylint: disable=multiple-statements @@ -40,43 +39,6 @@ class DragonRequest(BaseModel): ... -class DragonRunPolicy(BaseModel): - """Policy specifying hardware constraints when running a Dragon job""" - - cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) - """List of CPU indices to which the job should be pinned""" - gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) - """List of GPU indices to which the job should be pinned""" - - @staticmethod - def from_run_args( - run_args: t.Dict[str, t.Union[int, str, float, None]] - ) -> "DragonRunPolicy": - """Create a DragonRunPolicy with hardware constraints passed from - a dictionary of run arguments - :param run_args: Dictionary of run arguments - :returns: DragonRunPolicy instance created from the run arguments""" - gpu_args = "" - if gpu_arg_value := run_args.get("gpu-affinity", None): - gpu_args = str(gpu_arg_value) - - cpu_args = "" - if cpu_arg_value := run_args.get("cpu-affinity", None): - cpu_args = str(cpu_arg_value) - - # run args converted to a string must be split back into a list[int] - gpu_affinity = [int(x.strip()) for x in gpu_args.split(",") if x] - cpu_affinity = [int(x.strip()) for x in cpu_args.split(",") if x] - - try: - return DragonRunPolicy( - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) - except ValidationError as ex: - raise SmartSimError("Unable to build DragonRunPolicy") from ex - - class DragonRunRequestView(DragonRequest): exe: t.Annotated[str, Field(min_length=1)] exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = [] @@ -95,7 +57,6 @@ class DragonRunRequestView(DragonRequest): @request_registry.register("run") class DragonRunRequest(DragonRunRequestView): current_env: t.Dict[str, t.Optional[str]] = {} - policy: t.Optional[DragonRunPolicy] = None def __str__(self) -> str: return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"}))) diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index 69a91547e7..b8baa4708c 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -28,8 +28,6 @@ import typing as t -from typing_extensions import override - from ..log import get_logger from .base import RunSettings @@ -65,7 +63,6 @@ def __init__( **kwargs, ) - @override def set_nodes(self, nodes: int) -> None: """Set the number of nodes @@ -73,38 +70,9 @@ def set_nodes(self, nodes: int) -> None: """ self.run_args["nodes"] = nodes - @override def set_tasks_per_node(self, tasks_per_node: int) -> 
None: """Set the number of tasks for this job :param tasks_per_node: number of tasks per node """ self.run_args["tasks-per-node"] = tasks_per_node - - @override - def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: - """Specify the node feature for this job - - :param feature_list: a collection of strings representing the required - node features. Currently supported node features are: "gpu" - """ - if isinstance(feature_list, str): - feature_list = feature_list.strip().split() - elif not all(isinstance(feature, str) for feature in feature_list): - raise TypeError("feature_list must be string or list of strings") - - self.run_args["node-feature"] = ",".join(feature_list) - - def set_cpu_affinity(self, devices: t.List[int]) -> None: - """Set the CPU affinity for this job - - :param devices: list of CPU indices to execute on - """ - self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices) - - def set_gpu_affinity(self, devices: t.List[int]) -> None: - """Set the GPU affinity for this job - - :param devices: list of GPU indices to execute on. - """ - self.run_args["gpu-affinity"] = ",".join(str(device) for device in devices) diff --git a/tests/test_dragon_run_request.py b/tests/test_dragon_backend.py similarity index 64% rename from tests/test_dragon_run_request.py rename to tests/test_dragon_backend.py index 94c17c222a..f284f38d99 100644 --- a/tests/test_dragon_run_request.py +++ b/tests/test_dragon_backend.py @@ -31,17 +31,19 @@ from unittest.mock import MagicMock import pytest -from pydantic import ValidationError # The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_b +pytestmark = pytest.mark.group_a try: import dragon - - dragon_loaded = True -except: - dragon_loaded = False +except ImportError: + pass +else: + pytest.skip( + reason="Using dragon as launcher, not running Dragon unit tests", + allow_module_level=True, + ) from smartsim._core.config import CONFIG from smartsim._core.schemas.dragonRequests import * @@ -57,36 +59,10 @@ class NodeMock(MagicMock): - def __init__( - self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 - ) -> None: - super().__init__() - self._mock_id = name - NodeMock._num_gpus = num_gpus - NodeMock._num_cpus = num_cpus - @property def hostname(self) -> str: - if self._mock_id: - return self._mock_id return create_short_id_str() - @property - def num_cpus(self) -> str: - return NodeMock._num_cpus - - @property - def num_gpus(self) -> str: - return NodeMock._num_gpus - - def _set_id(self, value: str) -> None: - self._mock_id = value - - def gpus(self, parent: t.Any = None) -> t.List[str]: - if self._num_gpus: - return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)] - return [] - class GroupStateMock(MagicMock): def Running(self) -> MagicMock: @@ -102,19 +78,13 @@ class ProcessGroupMock(MagicMock): puids = [121, 122] -def node_mock() -> NodeMock: - return NodeMock() - - -def get_mock_backend( - monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2 -) -> "DragonBackend": +def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": process_mock = MagicMock(returncode=0) process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) process_module_mock = MagicMock() process_module_mock.Process = process_mock - node_mock = NodeMock(num_gpus=num_gpus) + node_mock = NodeMock() system_mock = MagicMock(nodes=["node1", "node2", "node3"]) monkeypatch.setitem( sys.modules, @@ -229,7 +199,6 @@ def set_mock_group_infos( return group_infos 
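
A note on the test scaffolding above: ``get_mock_backend`` works by installing ``MagicMock`` stand-ins for the ``dragon`` packages in ``sys.modules`` before the ``DragonBackend`` is constructed, so these unit tests can run on machines without a Dragon installation. A stripped-down sketch of that pattern (module names abbreviated here; the real fixture patches several ``dragon.*`` submodules):

```python
import sys
from unittest.mock import MagicMock

import pytest


def test_without_real_dragon(monkeypatch: pytest.MonkeyPatch) -> None:
    # Install a stand-in for the optional dependency; pytest's monkeypatch
    # removes the entry from sys.modules again when the test finishes.
    fake_dragon = MagicMock()
    fake_dragon.native.machine.System.return_value = MagicMock(nodes=["node1"])

    monkeypatch.setitem(sys.modules, "dragon", fake_dragon)
    monkeypatch.setitem(
        sys.modules, "dragon.native.machine", fake_dragon.native.machine
    )

    # Code imported after this point resolves `dragon` to the MagicMock,
    # so backend objects can be built and exercised without the real package.
    assert "dragon" in sys.modules
```
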
-@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -240,7 +209,6 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: assert handshake_resp.dragon_pid == 99999 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) run_req = DragonRunRequest( @@ -291,7 +259,6 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -317,78 +284,6 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.group_infos[step_id].status == SmartSimStatus.STATUS_FAILED -def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that a policy is applied to a run request""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=None, - ) - assert run_req.policy is None - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that a policy is applied to a run request""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(cpu_affinity=[0, 1]), - ) - - run_resp = dragon_backend.process_request(run_req) - assert isinstance(run_resp, DragonRunResponse) - - step_id = run_resp.step_id - assert dragon_backend._queued_steps[step_id] == run_req - - mock_process_group = MagicMock(puids=[123, 124]) - - dragon_backend._group_infos[step_id].process_group = mock_process_group - dragon_backend._group_infos[step_id].puids = [123, 124] - dragon_backend._start_steps() - - assert dragon_backend._running_steps == [step_id] - assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id - - monkeypatch.setattr( - dragon_backend._group_infos[step_id].process_group, "status", "Running" - ) - - dragon_backend._update() - - assert dragon_backend._running_steps == [step_id] - assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id - - dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED - - dragon_backend._update() - assert not dragon_backend._running_steps - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -405,7 +300,6 @@ def test_udpate_status_request(monkeypatch: 
pytest.MonkeyPatch) -> None: } -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) @@ -437,7 +331,6 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: assert len(dragon_backend._free_hosts) == 3 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize( "immediate, kill_jobs, frontend_shutdown", [ @@ -496,7 +389,6 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("telemetry_flag", ["0", "1"]) def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) @@ -512,7 +404,6 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) - assert dragon_backend.cooldown_period == expected_cooldown -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) first_heartbeat = dragon_backend.last_heartbeat @@ -521,7 +412,6 @@ def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.last_heartbeat > first_heartbeat -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("num_nodes", [1, 3, 100]) def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -542,119 +432,6 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: ) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -@pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) -def test_can_honor_cpu_affinity( - monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] -) -> None: - """Verify that valid CPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(cpu_affinity=affinity), - ) - - assert dragon_backend._can_honor(run_req)[0] - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that invalid CPU affinities are NOT accepted - NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(cpu_affinity=list(range(9))), - ) - - assert not dragon_backend._can_honor(run_req)[0] - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -@pytest.mark.parametrize("affinity", [[0], [0, 1]]) -def test_can_honor_gpu_affinity( - monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] -) -> None: - """Verify that valid GPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", 
- exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(gpu_affinity=affinity), - ) - - assert dragon_backend._can_honor(run_req)[0] - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that invalid GPU affinities are NOT accepted - NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(gpu_affinity=list(range(3))), - ) - - assert not dragon_backend._can_honor(run_req)[0] - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that a request for a GPU if none exists is not accepted""" - - # create a mock node class that always reports no GPUs available - dragon_backend = get_mock_backend(monkeypatch, num_gpus=0) - - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - # specify GPU device w/no affinity - policy=DragonRunPolicy(gpu_affinity=[0]), - ) - - assert not dragon_backend._can_honor(run_req)[0] - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) step_id = next(dragon_backend._step_ids) @@ -663,7 +440,6 @@ def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: assert step_id != next(dragon_backend._step_ids) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_view(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) set_mock_group_infos(monkeypatch, dragon_backend) @@ -671,21 +447,17 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: expected_message = textwrap.dedent(f"""\ Dragon server backend update - | Host | Status | - |--------|----------| + | Host | Status | + |---------|----------| | {hosts[0]} | Busy | | {hosts[1]} | Free | | {hosts[2]} | Free | | Step | Status | Hosts | Return codes | Num procs | - |----------|--------------|-------------|----------------|-------------| + |----------|--------------|-----------------|----------------|-------------| | abc123-1 | Running | {hosts[0]} | | 1 | | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | | ljace0-5 | NeverStarted | | | 0 |""") - # get rid of white space to make the comparison easier - actual_msg = dragon_backend.status_message.replace(" ", "") - expected_message = expected_message.replace(" ", "") - - assert actual_msg == expected_message + assert dragon_backend.status_message == expected_message diff --git a/tests/test_dragon_client.py b/tests/test_dragon_client.py deleted file mode 100644 index 80257b6107..0000000000 --- a/tests/test_dragon_client.py +++ /dev/null @@ -1,192 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os -import pathlib -import typing as t -from unittest.mock import MagicMock - -import pytest - -from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep -from smartsim.settings import DragonRunSettings -from smartsim.settings.slurmSettings import SbatchSettings - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -import smartsim._core.entrypoints.dragon_client as dragon_client -from smartsim._core.schemas.dragonRequests import * -from smartsim._core.schemas.dragonResponses import * - - -@pytest.fixture -def dragon_batch_step(test_dir: str) -> "DragonBatchStep": - """Fixture for creating a default batch of steps for a dragon launcher""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - # create some steps to verify the requests file output changes - rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) - rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) - rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) - rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) - - names = "test00", "test01", "test02", "test03" - settings = rs0, rs1, rs2, rs3 - - # create steps with: - # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity - cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] - gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] - - # assign some unique affinities to each run setting instance - for index, rs in enumerate(settings): - if gpu_affinities[index]: - rs.set_node_feature("gpu") - rs.set_cpu_affinity(cpu_affinities[index]) - rs.set_gpu_affinity(gpu_affinities[index]) - - steps = list( - DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) - ) - - for index, step in enumerate(steps): - # ensure meta is configured... - step.meta["status_dir"] = status_dir - # ... 
and put all the steps into the batch - batch_step.add_to_batch(steps[index]) - - return batch_step - - -def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: - """Helper method for finding the path to a request file from the launch command""" - script_path = pathlib.Path(launch_cmd[-1]) - batch_script = script_path.read_text(encoding="utf-8") - batch_statements = [line for line in batch_script.split("\n") if line] - entrypoint_cmd = batch_statements[-1] - requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) - return requests_file - - -def test_dragon_client_main_no_arg(monkeypatch: pytest.MonkeyPatch): - """Verify the client fails when the path to a submission file is not provided.""" - with pytest.raises(SystemExit): - dragon_client.cleanup = MagicMock() - dragon_client.main([]) - - # arg parser failures occur before resource allocation and should - # not result in resource cleanup being called - assert not dragon_client.cleanup.called - - -def test_dragon_client_main_empty_arg(test_dir: str): - """Verify the client fails when the path to a submission file is empty.""" - - with pytest.raises(ValueError) as ex: - dragon_client.cleanup = MagicMock() - dragon_client.main(["+submit", ""]) - - # verify it's a value error related to submit argument - assert "file not provided" in ex.value.args[0] - - # arg parser failures occur before resource allocation and should - # not result in resource cleanup being called - assert not dragon_client.cleanup.called - - -def test_dragon_client_main_bad_arg(test_dir: str): - """Verify the client returns a failure code when the path to a submission file is - invalid and does not raise an exception""" - path = pathlib.Path(test_dir) / "nonexistent_file.json" - - dragon_client.cleanup = MagicMock() - return_code = dragon_client.main(["+submit", str(path)]) - - # ensure non-zero return code - assert return_code != 0 - - # ensure failures do not block resource cleanup - assert dragon_client.cleanup.called - - -def test_dragon_client_main( - dragon_batch_step: DragonBatchStep, monkeypatch: pytest.MonkeyPatch -): - """Verify the client returns a failure code when the path to a submission file is - invalid and does not raise an exception""" - launch_cmd = dragon_batch_step.get_launch_cmd() - path = get_request_path_from_batch_script(launch_cmd) - num_requests_in_batch = 4 - num_shutdown_requests = 1 - request_count = num_requests_in_batch + num_shutdown_requests - submit_value = str(path) - - mock_connector = MagicMock() # DragonConnector - mock_connector.is_connected = True - mock_connector.send_request.return_value = DragonRunResponse(step_id="mock_step_id") - # mock can_monitor to exit before the infinite loop checking for shutdown - mock_connector.can_monitor = False - - mock_connector_class = MagicMock() - mock_connector_class.return_value = mock_connector - - # with monkeypatch.context() as ctx: - dragon_client.DragonConnector = mock_connector_class - dragon_client.cleanup = MagicMock() - - return_code = dragon_client.main(["+submit", submit_value]) - - # verify each request in the request file was processed - assert mock_connector.send_request.call_count == request_count - - # we know the batch fixture has a step with no affinity args supplied. 
skip it - for i in range(1, num_requests_in_batch): - sent_args = mock_connector.send_request.call_args_list[i][0] - request_arg = sent_args[0] - - assert isinstance(request_arg, DragonRunRequest) - - policy = request_arg.policy - - # make sure each policy has been read in correctly with valid affinity indices - assert len(policy.cpu_affinity) == len(set(policy.cpu_affinity)) - assert len(policy.gpu_affinity) == len(set(policy.gpu_affinity)) - - # we get a non-zero due to avoiding the infinite loop. consider refactoring - assert return_code == os.EX_IOERR - - # ensure failures do not block resource cleanup - assert dragon_client.cleanup.called diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 4fe8bf71b4..ee0fcb14b7 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -31,7 +31,6 @@ import sys import time import typing as t -from unittest.mock import MagicMock import pytest import zmq @@ -39,74 +38,15 @@ import smartsim._core.config from smartsim._core._cli.scripts.dragon_install import create_dotenv from smartsim._core.config.config import get_config -from smartsim._core.launcher.dragon.dragonLauncher import ( - DragonConnector, - DragonLauncher, -) +from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, ) -from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest -from smartsim._core.schemas.dragonResponses import ( - DragonHandshakeResponse, - DragonRunResponse, -) +from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse from smartsim._core.utils.network import IFConfig, find_free_port from smartsim._core.utils.security import KeyManager -from smartsim.error.errors import LauncherError -from smartsim.settings.dragonRunSettings import DragonRunSettings -from smartsim.settings.slurmSettings import SbatchSettings - - -@pytest.fixture -def dragon_batch_step(test_dir: str) -> DragonBatchStep: - """Fixture for creating a default batch of steps for a dragon launcher""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - # create some steps to verify the requests file output changes - rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) - rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) - rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) - rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) - - names = "test00", "test01", "test02", "test03" - settings = rs0, rs1, rs2, rs3 - - # create steps with: - # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity - cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] - gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] - - # assign some unique affinities to each run setting instance - for index, rs in enumerate(settings): - if gpu_affinities[index]: - rs.set_node_feature("gpu") - rs.set_cpu_affinity(cpu_affinities[index]) - rs.set_gpu_affinity(gpu_affinities[index]) - - steps = list( - DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) - ) - - for index, step in enumerate(steps): - # ensure meta is configured... 
- step.meta["status_dir"] = status_dir - # ... and put all the steps into the batch - batch_step.add_to_batch(steps[index]) - - return batch_step - # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -581,162 +521,3 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): # any non-dragon keys that didn't exist avoid unnecessary prepending assert merged_env[non_dragon_key] == non_dragon_value - - -def test_run_step_fail(test_dir: str) -> None: - """Verify that the dragon launcher still returns the step id - when the running step fails""" - test_path = pathlib.Path(test_dir) - status_dir = (test_path / ".smartsim" / "logs").as_posix() - - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - step0 = DragonStep("step0", test_dir, rs) - step0.meta["status_dir"] = status_dir - - mock_connector = MagicMock() # DragonConnector() - mock_connector.is_connected = True - mock_connector.send_request = MagicMock( - return_value=DragonRunResponse(step_id=step0.name, error_message="mock fail!") - ) - - launcher = DragonLauncher() - launcher._connector = mock_connector - - result = launcher.run(step0) - - # verify the failed step name is in the result - assert step0.name in result - - -def test_run_step_batch_empty(dragon_batch_step: DragonBatchStep) -> None: - """Verify that the dragon launcher behaves when asked to execute - a batch step that has no sub-steps""" - # remove the steps added in the batch fixture - dragon_batch_step.steps.clear() - - mock_step_id = "MOCK-STEPID" - mock_connector = MagicMock() # DragonConnector() - mock_connector.is_connected = True - mock_connector.send_request = MagicMock( - return_value=DragonRunResponse( - step_id=dragon_batch_step.name, error_message="mock fail!" - ) - ) - - launcher = DragonLauncher() - launcher._connector = mock_connector - launcher.task_manager.start_and_wait = MagicMock(return_value=(0, mock_step_id, "")) - - result = launcher.run(dragon_batch_step) - - # verify a step name is returned - assert result - # verify the batch step name is not in the result (renamed to SLURM-*) - assert dragon_batch_step.name not in result - - send_invocation = mock_connector.send_request - - # verify a batch request is not sent through the dragon connector - send_invocation.assert_not_called() - - -def test_run_step_batch_failure(dragon_batch_step: DragonBatchStep) -> None: - """Verify that the dragon launcher sends returns the step id - when the running step fails""" - mock_connector = MagicMock() # DragonConnector() - mock_connector.is_connected = True - mock_connector.send_request = MagicMock( - return_value=DragonRunResponse( - step_id=dragon_batch_step.name, error_message="mock fail!" - ) - ) - - mock_step_id = "MOCK-STEPID" - error_msg = "DOES_NOT_COMPUTE!" 
- launcher = DragonLauncher() - launcher._connector = mock_connector - launcher.task_manager.start_and_wait = MagicMock( - return_value=(1, mock_step_id, error_msg) - ) - - # a non-zero return code from the batch script should raise an error - with pytest.raises(LauncherError) as ex: - launcher.run(dragon_batch_step) - - # verify the correct error message is in the exception - assert error_msg in ex.value.args[0] - - -def test_run_step_success(test_dir: str) -> None: - """Verify that the dragon launcher sends the correctly formatted request for a step""" - test_path = pathlib.Path(test_dir) - status_dir = (test_path / ".smartsim" / "logs").as_posix() - - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - step0 = DragonStep("step0", test_dir, rs) - step0.meta["status_dir"] = status_dir - - mock_connector = MagicMock() # DragonConnector() - mock_connector.is_connected = True - mock_connector.send_request = MagicMock( - return_value=DragonRunResponse(step_id=step0.name) - ) - - launcher = DragonLauncher() - launcher._connector = mock_connector - - result = launcher.run(step0) - - # verify the successfully executed step name is in the result - assert step0.name in result - - # verify the DragonRunRequest sent matches all expectations - send_invocation = mock_connector.send_request - send_invocation.assert_called_once() - - args = send_invocation.call_args[0] # call_args == t.Tuple[args, kwargs] - - dragon_run_request = args[0] - req_name = dragon_run_request.name # name sent to dragon env - assert req_name.startswith(step0.name) - - req_policy_cpu_affinity = dragon_run_request.policy.cpu_affinity - assert not req_policy_cpu_affinity # default should be empty list - - req_policy_gpu_affinity = dragon_run_request.policy.gpu_affinity - assert not req_policy_gpu_affinity # default should be empty list - - -def test_run_step_success_batch( - monkeypatch: pytest.MonkeyPatch, dragon_batch_step: DragonBatchStep -) -> None: - """Verify that the dragon launcher sends the correctly formatted request - for a batch step""" - mock_connector = MagicMock() # DragonConnector() - mock_connector.is_connected = True - mock_connector.send_request = MagicMock( - return_value=DragonRunResponse(step_id=dragon_batch_step.name) - ) - - launcher = DragonLauncher() - launcher._connector = mock_connector - launcher.task_manager.start_and_wait = MagicMock(return_value=(0, "success", "")) - - result = launcher.run(dragon_batch_step) - - # verify the successfully executed step name is in the result - assert dragon_batch_step.name not in result - assert result - - send_invocation = mock_connector.send_request - - # verify a batch request is not sent through the dragon connector - send_invocation.assert_not_called() - launcher.task_manager.start_and_wait.assert_called_once() - - args = launcher.task_manager.start_and_wait.call_args[0] - - # verify the batch script is executed - launch_cmd = dragon_batch_step.get_launch_cmd() - for stmt in launch_cmd: - assert stmt in args[0] # args[0] is the cmd list sent to subprocess.Popen diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py deleted file mode 100644 index 1d8d069fab..0000000000 --- a/tests/test_dragon_run_policy.py +++ /dev/null @@ -1,371 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. 
Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pathlib - -import pytest - -from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep -from smartsim.settings.dragonRunSettings import DragonRunSettings -from smartsim.settings.slurmSettings import SbatchSettings - -try: - from dragon.infrastructure.policy import Policy - - import smartsim._core.entrypoints.dragon as drg - from smartsim._core.launcher.dragon.dragonBackend import DragonBackend - - dragon_loaded = True -except: - dragon_loaded = False - -# The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_b - -from smartsim._core.schemas.dragonRequests import * -from smartsim._core.schemas.dragonResponses import * - - -@pytest.fixture -def dragon_batch_step(test_dir: str) -> "DragonBatchStep": - """Fixture for creating a default batch of steps for a dragon launcher""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - # create some steps to verify the requests file output changes - rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) - rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) - rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) - rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) - - names = "test00", "test01", "test02", "test03" - settings = rs0, rs1, rs2, rs3 - - # create steps with: - # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity - cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] - gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] - - # assign some unique affinities to each run setting instance - for index, rs in enumerate(settings): - if gpu_affinities[index]: - rs.set_node_feature("gpu") - rs.set_cpu_affinity(cpu_affinities[index]) - rs.set_gpu_affinity(gpu_affinities[index]) - - steps = list( - DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) - ) - - for index, step in enumerate(steps): - # ensure meta is configured... - step.meta["status_dir"] = status_dir - # ... 
and put all the steps into the batch - batch_step.add_to_batch(steps[index]) - - return batch_step - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -@pytest.mark.parametrize( - "dragon_request", - [ - pytest.param(DragonHandshakeRequest(), id="DragonHandshakeRequest"), - pytest.param(DragonShutdownRequest(), id="DragonShutdownRequest"), - pytest.param( - DragonBootstrapRequest(address="localhost"), id="DragonBootstrapRequest" - ), - ], -) -def test_create_run_policy_non_run_request(dragon_request: DragonRequest) -> None: - """Verify that a default policy is returned when a request is - not attempting to start a new proccess (e.g. a DragonRunRequest)""" - policy = DragonBackend.create_run_policy(dragon_request, "localhost") - - assert policy is not None, "Default policy was not returned" - assert ( - policy.device == Policy.Device.DEFAULT - ), "Default device was not Device.DEFAULT" - assert policy.cpu_affinity == [], "Default cpu affinity was not empty" - assert policy.gpu_affinity == [], "Default gpu affinity was not empty" - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_create_run_policy_run_request_no_run_policy() -> None: - """Verify that a policy specifying no policy is returned with all default - values (no device, empty cpu & gpu affinity)""" - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - # policy= # <--- skipping this - ) - - policy = DragonBackend.create_run_policy(run_req, "localhost") - - assert policy.device == Policy.Device.DEFAULT - assert set(policy.cpu_affinity) == set() - assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.DEFAULT - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_create_run_policy_run_request_default_run_policy() -> None: - """Verify that a policy specifying no affinity is returned with - default value for device and empty affinity lists""" - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(), # <--- passing default values - ) - - policy = DragonBackend.create_run_policy(run_req, "localhost") - - assert set(policy.cpu_affinity) == set() - assert set(policy.gpu_affinity) == set() - assert policy.affinity == Policy.Affinity.DEFAULT - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: - """Verify that a input policy specifying a CPU affinity but lacking the device field - produces a Dragon Policy with the CPU device specified""" - affinity = set([0, 2, 4]) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(cpu_affinity=list(affinity)), # <-- no device spec - ) - - policy = DragonBackend.create_run_policy(run_req, "localhost") - - assert set(policy.cpu_affinity) == affinity - assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_create_run_policy_run_request_cpu_affinity() -> None: - """Verify that a policy specifying CPU affinity 
is returned as expected""" - affinity = set([0, 2, 4]) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(cpu_affinity=list(affinity)), - ) - - policy = DragonBackend.create_run_policy(run_req, "localhost") - - assert set(policy.cpu_affinity) == affinity - assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_create_run_policy_run_request_gpu_affinity() -> None: - """Verify that a policy specifying GPU affinity is returned as expected""" - affinity = set([0, 2, 4]) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(device="gpu", gpu_affinity=list(affinity)), - ) - - policy = DragonBackend.create_run_policy(run_req, "localhost") - - assert policy.cpu_affinity == [] - assert set(policy.gpu_affinity) == set(affinity) - assert policy.affinity == Policy.Affinity.SPECIFIC - - -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -def test_dragon_run_policy_from_run_args() -> None: - """Verify that a DragonRunPolicy is created from a dictionary of run arguments""" - run_args = { - "gpu-affinity": "0,1,2", - "cpu-affinity": "3,4,5,6", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [3, 4, 5, 6] - assert policy.gpu_affinity == [0, 1, 2] - - -def test_dragon_run_policy_from_run_args_empty() -> None: - """Verify that a DragonRunPolicy is created from an empty - dictionary of run arguments""" - run_args = {} - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [] - assert policy.gpu_affinity == [] - - -def test_dragon_run_policy_from_run_args_cpu_affinity() -> None: - """Verify that a DragonRunPolicy is created from a dictionary - of run arguments containing a CPU affinity""" - run_args = { - "cpu-affinity": "3,4,5,6", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [3, 4, 5, 6] - assert policy.gpu_affinity == [] - - -def test_dragon_run_policy_from_run_args_gpu_affinity() -> None: - """Verify that a DragonRunPolicy is created from a dictionary - of run arguments containing a GPU affinity""" - run_args = { - "gpu-affinity": "0, 1, 2", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [] - assert policy.gpu_affinity == [0, 1, 2] - - -def test_dragon_run_policy_from_run_args_invalid_gpu_affinity() -> None: - """Verify that a DragonRunPolicy is NOT created from a dictionary - of run arguments with an invalid GPU affinity""" - run_args = { - "gpu-affinity": "0,-1,2", - } - - with pytest.raises(SmartSimError) as ex: - DragonRunPolicy.from_run_args(run_args) - - assert "DragonRunPolicy" in ex.value.args[0] - - -def test_dragon_run_policy_from_run_args_invalid_cpu_affinity() -> None: - """Verify that a DragonRunPolicy is NOT created from a dictionary - of run arguments with an invalid CPU affinity""" - run_args = { - "cpu-affinity": "3,4,5,-6", - } - - with pytest.raises(SmartSimError) as ex: - DragonRunPolicy.from_run_args(run_args) - - assert "DragonRunPolicy" in ex.value.args[0] - - -def test_dragon_run_policy_from_run_args_ignore_empties_gpu() -> None: - """Verify that a DragonRunPolicy is 
created from a dictionary - of run arguments and ignores empty values in the serialized gpu list""" - run_args = { - "gpu-affinity": "0,,2", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [] - assert policy.gpu_affinity == [0, 2] - - -def test_dragon_run_policy_from_run_args_ignore_empties_cpu() -> None: - """Verify that a DragonRunPolicy is created from a dictionary - of run arguments and ignores empty values in the serialized cpu list""" - run_args = { - "cpu-affinity": "3,4,,6,", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [3, 4, 6] - assert policy.gpu_affinity == [] - - -def test_dragon_run_policy_from_run_args_null_gpu_affinity() -> None: - """Verify that a DragonRunPolicy is created if a null value is encountered - in the gpu-affinity list""" - run_args = { - "gpu-affinity": None, - "cpu-affinity": "3,4,5,6", - } - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [3, 4, 5, 6] - assert policy.gpu_affinity == [] - - -def test_dragon_run_policy_from_run_args_null_cpu_affinity() -> None: - """Verify that a DragonRunPolicy is created if a null value is encountered - in the cpu-affinity list""" - run_args = {"gpu-affinity": "0,1,2", "cpu-affinity": None} - - policy = DragonRunPolicy.from_run_args(run_args) - - assert policy.cpu_affinity == [] - assert policy.gpu_affinity == [0, 1, 2] diff --git a/tests/test_dragon_run_request_nowlm.py b/tests/test_dragon_run_request_nowlm.py deleted file mode 100644 index afd25aa9d7..0000000000 --- a/tests/test_dragon_run_request_nowlm.py +++ /dev/null @@ -1,105 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest -from pydantic import ValidationError - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - -from smartsim._core.schemas.dragonRequests import * -from smartsim._core.schemas.dragonResponses import * - - -def test_run_request_with_null_policy(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that an empty policy does not cause an error""" - # dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=None, - ) - assert run_req.policy is None - - -def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: - """Verify that a non-empty policy is set correctly""" - # dragon_backend = get_mock_backend(monkeypatch) - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy(), - ) - assert run_req.policy is not None - assert not run_req.policy.cpu_affinity - assert not run_req.policy.gpu_affinity - - -@pytest.mark.parametrize( - "device,cpu_affinity,gpu_affinity", - [ - pytest.param("cpu", [-1], [], id="cpu_affinity"), - pytest.param("gpu", [], [-1], id="gpu_affinity"), - ], -) -def test_run_request_with_negative_affinity( - device: str, - cpu_affinity: t.List[int], - gpu_affinity: t.List[int], -) -> None: - """Verify that invalid affinity values fail validation""" - with pytest.raises(ValidationError) as ex: - DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - policy=DragonRunPolicy( - cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity - ), - ) - - assert f"{device}_affinity" in str(ex.value.args[0]) - assert "NumberNotGeError" in str(ex.value.args[0]) diff --git a/tests/test_dragon_runsettings.py b/tests/test_dragon_runsettings.py deleted file mode 100644 index 34e8510e82..0000000000 --- a/tests/test_dragon_runsettings.py +++ /dev/null @@ -1,98 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -from smartsim.settings import DragonRunSettings - -# The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_a - - -def test_dragon_runsettings_nodes(): - """Verify that node count is set correctly""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - exp_value = 3 - rs.set_nodes(exp_value) - assert rs.run_args["nodes"] == exp_value - - exp_value = 9 - rs.set_nodes(exp_value) - assert rs.run_args["nodes"] == exp_value - - -def test_dragon_runsettings_tasks_per_node(): - """Verify that tasks per node is set correctly""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - exp_value = 3 - rs.set_tasks_per_node(exp_value) - assert rs.run_args["tasks-per-node"] == exp_value - - exp_value = 7 - rs.set_tasks_per_node(exp_value) - assert rs.run_args["tasks-per-node"] == exp_value - - -def test_dragon_runsettings_cpu_affinity(): - """Verify that the CPU affinity is set correctly""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - exp_value = [0, 1, 2, 3] - rs.set_cpu_affinity([0, 1, 2, 3]) - assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value) - - # ensure the value is not changed when we extend the list - exp_value.extend([4, 5, 6]) - assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value) - - rs.set_cpu_affinity(exp_value) - assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value) - - # ensure the value is not changed when we extend the list - rs.run_args["cpu-affinity"] = "7,8,9" - assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value) - - -def test_dragon_runsettings_gpu_affinity(): - """Verify that the GPU affinity is set correctly""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - exp_value = [0, 1, 2, 3] - rs.set_gpu_affinity([0, 1, 2, 3]) - assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value) - - # ensure the value is not changed when we extend the list - exp_value.extend([4, 5, 6]) - assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value) - - rs.set_gpu_affinity(exp_value) - assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value) - - # ensure the value is not changed when we extend the list - rs.run_args["gpu-affinity"] = "7,8,9" - assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value) diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py deleted file mode 100644 index 19f408e0bd..0000000000 --- a/tests/test_dragon_step.py +++ /dev/null @@ -1,394 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import pathlib -import shutil -import sys -import typing as t - -import pytest - -from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep -from smartsim.settings import DragonRunSettings -from smartsim.settings.pbsSettings import QsubBatchSettings -from smartsim.settings.slurmSettings import SbatchSettings - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -from smartsim._core.schemas.dragonRequests import * -from smartsim._core.schemas.dragonResponses import * - - -@pytest.fixture -def dragon_batch_step(test_dir: str) -> DragonBatchStep: - """Fixture for creating a default batch of steps for a dragon launcher""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - # create some steps to verify the requests file output changes - rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) - rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) - rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) - rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) - - names = "test00", "test01", "test02", "test03" - settings = rs0, rs1, rs2, rs3 - - # create steps with: - # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity - cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] - gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] - - # assign some unique affinities to each run setting instance - for index, rs in enumerate(settings): - if gpu_affinities[index]: - rs.set_node_feature("gpu") - rs.set_cpu_affinity(cpu_affinities[index]) - rs.set_gpu_affinity(gpu_affinities[index]) - - steps = list( - DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) - ) - - for index, step in enumerate(steps): - # ensure meta is configured... - step.meta["status_dir"] = status_dir - # ... 
and put all the steps into the batch - batch_step.add_to_batch(steps[index]) - - return batch_step - - -def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: - """Helper method for finding the path to a request file from the launch command""" - script_path = pathlib.Path(launch_cmd[-1]) - batch_script = script_path.read_text(encoding="utf-8") - batch_statements = [line for line in batch_script.split("\n") if line] - entrypoint_cmd = batch_statements[-1] - requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) - return requests_file - - -def test_dragon_step_creation(test_dir: str) -> None: - """Verify that the step is created with the values provided""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - original_name = "test" - step = DragonStep(original_name, test_dir, rs) - - # confirm the name has been made unique to avoid conflicts - assert step.name != original_name - assert step.entity_name == original_name - assert step.cwd == test_dir - assert step.step_settings is not None - - -def test_dragon_step_name_uniqueness(test_dir: str) -> None: - """Verify that step name is unique and independent of step content""" - - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - original_name = "test" - - num_steps = 100 - steps = [DragonStep(original_name, test_dir, rs) for _ in range(num_steps)] - - # confirm the name has been made unique in each step - step_names = {step.name for step in steps} - assert len(step_names) == num_steps - - -def test_dragon_step_launch_cmd(test_dir: str) -> None: - """Verify the expected launch cmd is generated w/minimal settings""" - exp_exe = "sleep" - exp_exe_args = "1" - rs = DragonRunSettings(exe=exp_exe, exe_args=[exp_exe_args]) - - original_name = "test" - step = DragonStep(original_name, test_dir, rs) - - launch_cmd = step.get_launch_cmd() - assert len(launch_cmd) == 2 - - # we'll verify the exe_args and exe name are handled correctly - exe, args = launch_cmd - assert exp_exe in exe - assert exp_exe_args in args - - # also, verify that a string exe_args param instead of list is handled correctly - exp_exe_args = "1 2 3" - rs = DragonRunSettings(exe=exp_exe, exe_args=exp_exe_args) - step = DragonStep(original_name, test_dir, rs) - launch_cmd = step.get_launch_cmd() - assert len(launch_cmd) == 4 # "/foo/bar/sleep 1 2 3" - - -def test_dragon_step_launch_cmd_multi_arg(test_dir: str) -> None: - """Verify the expected launch cmd is generated when multiple arguments - are passed to run settings""" - exp_exe = "sleep" - arg0, arg1, arg2 = "1", "2", "3" - rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) - - original_name = "test" - - step = DragonStep(original_name, test_dir, rs) - - launch_cmd = step.get_launch_cmd() - assert len(launch_cmd) == 4 - - exe, *args = launch_cmd - assert exp_exe in exe - assert arg0 in args - assert arg1 in args - assert arg2 in args - - -def test_dragon_step_launch_cmd_no_bash( - test_dir: str, monkeypatch: pytest.MonkeyPatch -) -> None: - """Verify that requirement for bash shell is checked""" - exp_exe = "sleep" - arg0, arg1, arg2 = "1", "2", "3" - rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) - rs.colocated_db_settings = {"foo": "bar"} # triggers bash lookup - - original_name = "test" - step = DragonStep(original_name, test_dir, rs) - - with pytest.raises(RuntimeError) as ex, monkeypatch.context() as ctx: - ctx.setattr(shutil, "which", lambda _: None) - step.get_launch_cmd() - - # verify the exception thrown is the one we're looking for - assert "Could not 
find" in ex.value.args[0] - - -def test_dragon_step_colocated_db() -> None: - # todo: implement a test for the branch where bash is found and - # run_settings.colocated_db_settings is set - ... - - -def test_dragon_step_container() -> None: - # todo: implement a test for the branch where run_settings.container - # is an instance of class `Singularity` - ... - - -def test_dragon_step_run_settings_accessor(test_dir: str) -> None: - """Verify the run settings passed to the step are copied correctly and - are not inadvertently modified outside the step""" - exp_exe = "sleep" - arg0, arg1, arg2 = "1", "2", "3" - rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) - - original_name = "test" - step = DragonStep(original_name, test_dir, rs) - rs_output = step.run_settings - - assert rs.exe == rs_output.exe - assert rs.exe_args == rs_output.exe_args - - # ensure we have a deep copy - rs.exe = "foo" - assert id(step.run_settings) != id(rs) - assert step.run_settings.exe != rs.exe - - -def test_dragon_batch_step_creation(test_dir: str) -> None: - """Verify that the batch step is created with the values provided""" - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # confirm the name has been made unique to avoid conflicts - assert batch_step.name != batch_step_name - assert batch_step.entity_name == batch_step_name - assert batch_step.cwd == test_dir - assert batch_step.batch_settings is not None - assert batch_step.managed - - -def test_dragon_batch_step_add_to_batch(test_dir: str) -> None: - """Verify that steps are added to the batch correctly""" - rs = DragonRunSettings(exe="sleep", exe_args=["1"]) - - name0, name1, name2 = "test00", "test01", "test02" - step0 = DragonStep(name0, test_dir, rs) - step1 = DragonStep(name1, test_dir, rs) - step2 = DragonStep(name2, test_dir, rs) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - assert len(batch_step.steps) == 0 - - batch_step.add_to_batch(step0) - assert len(batch_step.steps) == 1 - assert name0 in ",".join({step.name for step in batch_step.steps}) - - batch_step.add_to_batch(step1) - assert len(batch_step.steps) == 2 - assert name1 in ",".join({step.name for step in batch_step.steps}) - - batch_step.add_to_batch(step2) - assert len(batch_step.steps) == 3 - assert name2 in ",".join({step.name for step in batch_step.steps}) - - -def test_dragon_batch_step_get_launch_command_meta_fail(test_dir: str) -> None: - """Verify that the batch launch command cannot be generated without - having the status directory set in the step metadata""" - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - with pytest.raises(KeyError) as ex: - batch_step.get_launch_cmd() - - -@pytest.mark.parametrize( - "batch_settings_class,batch_exe,batch_header,node_spec_tpl", - [ - pytest.param( - SbatchSettings, "sbatch", "#SBATCH", "#SBATCH --nodes={0}", id="sbatch" - ), - pytest.param(QsubBatchSettings, "qsub", "#PBS", "#PBS -l nodes={0}", id="qsub"), - ], -) -def test_dragon_batch_step_get_launch_command( - test_dir: str, - batch_settings_class: t.Type, - batch_exe: str, - batch_header: str, - node_spec_tpl: str, -) -> None: - """Verify that the batch launch command is properly generated and - 
the expected side effects are present (writing script file to disk)""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = batch_settings_class(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - launch_cmd = batch_step.get_launch_cmd() - assert launch_cmd - - full_cmd = " ".join(launch_cmd) - assert batch_exe in full_cmd # verify launcher running the batch - assert test_dir in full_cmd # verify outputs are sent to expected directory - assert "batch_step.sh" in full_cmd # verify batch script name is in the command - - # ...verify that the script file is written when getting the launch command - script_path = pathlib.Path(launch_cmd[-1]) - assert script_path.exists() - assert len(script_path.read_bytes()) > 0 - - batch_script = script_path.read_text(encoding="utf-8") - - # ...verify the script file has the expected batch script header content - assert batch_header in batch_script - assert node_spec_tpl.format(num_nodes) in batch_script # verify node count is set - - # ...verify the script has the expected entrypoint command - batch_statements = [line for line in batch_script.split("\n") if line] - python_path = sys.executable - - entrypoint_cmd = batch_statements[-1] - assert python_path in entrypoint_cmd - assert "smartsim._core.entrypoints.dragon_client +submit" in entrypoint_cmd - - -def test_dragon_batch_step_write_request_file_no_steps(test_dir: str) -> None: - """Verify that the batch launch command writes an appropriate request file - if no steps are attached""" - test_path = pathlib.Path(test_dir) - - batch_step_name = "batch_step" - num_nodes = 4 - batch_settings = SbatchSettings(nodes=num_nodes) - batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir - - launch_cmd = batch_step.get_launch_cmd() - requests_file = get_request_path_from_batch_script(launch_cmd) - - # no steps have been added yet, so the requests file should be a serialized, empty list - assert requests_file.read_text(encoding="utf-8") == "[]" - - -def test_dragon_batch_step_write_request_file( - dragon_batch_step: DragonBatchStep, -) -> None: - """Verify that the batch launch command writes an appropriate request file - for the set of attached steps""" - # create steps with: - # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity - cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] - gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] - - launch_cmd = dragon_batch_step.get_launch_cmd() - requests_file = get_request_path_from_batch_script(launch_cmd) - - requests_text = requests_file.read_text(encoding="utf-8") - requests_json: t.List[str] = json.loads(requests_text) - - # verify that there is an item in file for each step added to the batch - assert len(requests_json) == len(dragon_batch_step.steps) - - for index, req in enumerate(requests_json): - req_type, req_data = req.split("|", 1) - # the only steps added are to execute apps, requests should be of type "run" - assert req_type == "run" - - run_request = DragonRunRequest(**json.loads(req_data)) - assert run_request - assert run_request.policy.cpu_affinity == cpu_affinities[index] - assert run_request.policy.gpu_affinity == gpu_affinities[index] From 
b6c2f2baf526e766d5e09d8030699530e3aecf76 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:49:25 -0400 Subject: [PATCH 09/60] Merge latest develop into mli-feature (#640) [ committed by @ankona ] [ approved by @AlyssaCote ] --- doc/changelog.md | 1 + doc/dragon.rst | 28 ++ .../lattice/online_analysis.ipynb | 6 + .../_core/launcher/dragon/dragonBackend.py | 85 +++- .../_core/launcher/dragon/dragonLauncher.py | 6 + smartsim/_core/launcher/step/dragonStep.py | 10 +- smartsim/_core/launcher/step/step.py | 3 +- smartsim/_core/schemas/dragonRequests.py | 41 +- smartsim/settings/dragonRunSettings.py | 32 ++ tests/test_dragon_client.py | 192 +++++++++ tests/test_dragon_launcher.py | 223 +++++++++- tests/test_dragon_run_policy.py | 371 +++++++++++++++++ ..._backend.py => test_dragon_run_request.py} | 256 +++++++++++- tests/test_dragon_run_request_nowlm.py | 105 +++++ tests/test_dragon_runsettings.py | 98 +++++ tests/test_dragon_step.py | 394 ++++++++++++++++++ 16 files changed, 1826 insertions(+), 25 deletions(-) create mode 100644 tests/test_dragon_client.py create mode 100644 tests/test_dragon_run_policy.py rename tests/{test_dragon_backend.py => test_dragon_run_request.py} (64%) create mode 100644 tests/test_dragon_run_request_nowlm.py create mode 100644 tests/test_dragon_runsettings.py create mode 100644 tests/test_dragon_step.py diff --git a/doc/changelog.md b/doc/changelog.md index ee41fabf88..f4adf1c091 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -28,6 +28,7 @@ To be released at some future point in time Description +- Add hardware pinning capability when using dragon - Pin NumPy version to 1.x - New launcher support for SGE (and similar derivatives) - Fix test outputs being created in incorrect directory diff --git a/doc/dragon.rst b/doc/dragon.rst index 0bf6a8ea3c..e19b40e4b7 100644 --- a/doc/dragon.rst +++ b/doc/dragon.rst @@ -65,6 +65,34 @@ In the next sections, we detail how Dragon is integrated into SmartSim. For more information on HPC launchers, visit the :ref:`Run Settings` page. +Hardware Pinning +================ + +Dragon also enables users to specify hardware constraints using ``DragonRunSettings``. CPU +and GPU affinity can be specified using the ``DragonRunSettings`` object. The following +example demonstrates how to specify CPU affinity and GPU affinities simultaneously. Note +that affinities are passed as a list of device indices. + +.. code-block:: python + + # Because "dragon" was specified as the launcher during Experiment initialization, + # create_run_settings will return a DragonRunSettings object + rs = exp.create_run_settings(exe="mpi_app", + exe_args=["--option", "value"], + env_vars={"MYVAR": "VALUE"}) + + # Request the first 8 CPUs for this job + rs.set_cpu_affinity(list(range(9))) + + # Request the first two GPUs on the node for this job + rs.set_gpu_affinity([0, 1]) + +.. note:: + + SmartSim launches jobs in the order they are received on the first available + host in a round-robin pattern. To ensure a process is launched on a node with + specific features, configure a hostname constraint. 
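A minimal sketch, assuming the DragonRunSettings and DragonRunPolicy APIs added in this patch, of how the affinity settings above travel from run settings to the launcher: set_cpu_affinity and set_gpu_affinity serialize the device indices into comma-separated strings in the step's run_args, and DragonRunPolicy.from_run_args parses those strings back into integer lists (raising SmartSimError if a negative index is supplied).

    from smartsim.settings import DragonRunSettings
    from smartsim._core.schemas.dragonRequests import DragonRunPolicy

    rs = DragonRunSettings(exe="mpi_app", exe_args=["--option", "value"])
    rs.set_cpu_affinity([0, 1, 2, 3])
    rs.set_gpu_affinity([0, 1])

    # the setters store the indices as comma-separated strings in run_args
    assert rs.run_args["cpu-affinity"] == "0,1,2,3"
    assert rs.run_args["gpu-affinity"] == "0,1"

    # the launcher later rebuilds the hardware constraints from those run args
    policy = DragonRunPolicy.from_run_args(rs.run_args)
    assert policy.cpu_affinity == [0, 1, 2, 3]
    assert policy.gpu_affinity == [0, 1]

When either affinity list is non-empty, DragonBackend.create_run_policy maps the request policy onto a Dragon Policy with Affinity.SPECIFIC; otherwise the default placement-only policy is used.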
+ ================= The Dragon Server ================= diff --git a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb index 412b63dd01..c5f58fa97b 100644 --- a/doc/tutorials/online_analysis/lattice/online_analysis.ipynb +++ b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb @@ -378,6 +378,7 @@ }, { "cell_type": "code", + "id": "6f3ed63d-e324-443d-9b68-b2cf618d31c7", "execution_count": 7, "metadata": {}, "outputs": [ @@ -399,6 +400,7 @@ }, { "cell_type": "markdown", + "id": "96c154fe-5ca8-4d89-91f8-8fd4e75cb80e", "metadata": {}, "source": [ "We then apply the function `probe_points` to the `ux` and `uy` tensors computed in the last time step of the previous simulation. Note that all tensors are already on the DB, thus we can reference them by name. Finally, we download and plot the output (a 2D velocity field), which is stored as `probe_u` on the DB." @@ -406,6 +408,7 @@ }, { "cell_type": "code", + "id": "36e3b415-dcc1-4d25-9cce-52388146a4bb", "execution_count": 8, "metadata": {}, "outputs": [ @@ -432,6 +435,7 @@ }, { "cell_type": "markdown", + "id": "9d7e4966-a0de-480c-9556-936197a5a5d2", "metadata": {}, "source": [ "### Uploading a function inline\n", @@ -453,6 +457,7 @@ }, { "cell_type": "markdown", + "id": "1c4daf43-34d0-482a-b9b5-b3b6f1e173c4", "metadata": {}, "source": [ "We then store the function on the DB under the key `norm_function`." @@ -470,6 +475,7 @@ }, { "cell_type": "markdown", + "id": "19409ac6-e118-44db-a847-2d905fdf0331", "metadata": {}, "source": [ "Note that the key we used identifies a functional unit containing the function itself: this is similar to the key used to store the `probe` script above. When we want to run the function, we just call it with `run_script`, by indicating the `script` key as `\"norm_function\"` and the name of the function itself as `\"compute_norm\"`." diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index dcc5c8392b..2938746361 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -214,9 +214,12 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: self._hosts: t.List[str] = sorted( - dragon_machine.Node(node).hostname - for node in dragon_machine.System().nodes + node for node in dragon_machine.System().nodes ) + self._nodes = [dragon_machine.Node(node) for node in self._hosts] + self._cpus = [node.num_cpus for node in self._nodes] + self._gpus = [node.num_gpus for node in self._nodes] + """List of hosts available in allocation""" self._free_hosts: t.Deque[str] = collections.deque(self._hosts) """List of hosts on which steps can be launched""" @@ -288,6 +291,34 @@ def current_time(self) -> float: """Current time for DragonBackend object, in seconds since the Epoch""" return time.time() + def _can_honor_policy( + self, request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the policy can be honored with resources available + in the allocation. 
+ :param request: DragonRunRequest containing policy information + :returns: Tuple indicating if the policy can be honored and + an optional error message""" + # ensure the policy can be honored + if request.policy: + if request.policy.cpu_affinity: + # make sure some node has enough CPUs + available = max(self._cpus) + requested = max(request.policy.cpu_affinity) + + if requested >= available: + return False, "Cannot satisfy request, not enough CPUs available" + + if request.policy.gpu_affinity: + # make sure some node has enough GPUs + available = max(self._gpus) + requested = max(request.policy.gpu_affinity) + + if requested >= available: + return False, "Cannot satisfy request, not enough GPUs available" + + return True, None + def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: """Check if request can be honored with resources available in the allocation. @@ -302,6 +333,11 @@ def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str] if self._shutdown_requested: message = "Cannot satisfy request, server is shutting down." return False, message + + honorable, err = self._can_honor_policy(request) + if not honorable: + return False, err + return True, None def _allocate_step( @@ -410,6 +446,46 @@ def infra_ddict(self) -> str: return str(self._infra_ddict.serialize()) + @staticmethod + def create_run_policy( + request: DragonRequest, node_name: str + ) -> "dragon_policy.Policy": + """Create a dragon Policy from the request and node name + :param request: DragonRunRequest containing policy information + :param node_name: Name of the node on which the process will run + :returns: dragon_policy.Policy object mapped from request properties""" + if isinstance(request, DragonRunRequest): + run_request: DragonRunRequest = request + + affinity = dragon_policy.Policy.Affinity.DEFAULT + cpu_affinity: t.List[int] = [] + gpu_affinity: t.List[int] = [] + + # Customize policy only if the client requested it, otherwise use default + if run_request.policy is not None: + # Affinities are not mutually exclusive. 
If specified, both are used + if run_request.policy.cpu_affinity: + affinity = dragon_policy.Policy.Affinity.SPECIFIC + cpu_affinity = run_request.policy.cpu_affinity + + if run_request.policy.gpu_affinity: + affinity = dragon_policy.Policy.Affinity.SPECIFIC + gpu_affinity = run_request.policy.gpu_affinity + + if affinity != dragon_policy.Policy.Affinity.DEFAULT: + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + affinity=affinity, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + ) + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -432,10 +508,7 @@ def _start_steps(self) -> None: policies = [] for node_name in hosts: - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - ) + local_policy = self.create_run_policy(request, node_name) policies.extend([local_policy] * request.tasks_per_node) tmp_proc = dragon_process.ProcessTemplate( target=request.exe, diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 17b47e3090..9078fed54f 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -29,6 +29,8 @@ import os import typing as t +from smartsim._core.schemas.dragonRequests import DragonRunPolicy + from ...._core.launcher.stepMapping import StepMap from ....error import LauncherError, SmartSimError from ....log import get_logger @@ -168,6 +170,9 @@ def run(self, step: Step) -> t.Optional[str]: merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + + policy = DragonRunPolicy.from_run_args(run_args) + response = _assert_schema_type( self._connector.send_request( DragonRunRequest( @@ -181,6 +186,7 @@ def run(self, step: Step) -> t.Optional[str]: current_env=merged_env, output_file=out, error_file=err, + policy=policy, ) ), DragonRunResponse, diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 036a9e5654..dd93d7910c 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -30,7 +30,11 @@ import sys import typing as t -from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry +from ...._core.schemas.dragonRequests import ( + DragonRunPolicy, + DragonRunRequest, + request_registry, +) from ....error.errors import SSUnsupportedError from ....log import get_logger from ....settings import ( @@ -166,8 +170,11 @@ def _write_request_file(self) -> str: nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + policy = DragonRunPolicy.from_run_args(run_args) + cmd = step.get_launch_cmd() out, err = step.get_output_files() + request = DragonRunRequest( exe=cmd[0], exe_args=cmd[1:], @@ -179,6 +186,7 @@ def _write_request_file(self) -> str: current_env=os.environ, output_file=out, error_file=err, + policy=policy, ) requests.append(request_registry.to_string(request)) with open(request_file, "w", encoding="utf-8") as script_file: diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 2cce6e6107..171254e32a 100644 --- a/smartsim/_core/launcher/step/step.py +++ 
b/smartsim/_core/launcher/step/step.py @@ -26,6 +26,7 @@ from __future__ import annotations +import copy import functools import os.path as osp import pathlib @@ -51,7 +52,7 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None: self.entity_name = name self.cwd = cwd self.managed = False - self.step_settings = step_settings + self.step_settings = copy.deepcopy(step_settings) self.meta: t.Dict[str, str] = {} @property diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 3e384f746a..487ea915a0 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -26,9 +26,10 @@ import typing as t -from pydantic import BaseModel, Field, PositiveInt +from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt, ValidationError import smartsim._core.schemas.utils as _utils +from smartsim.error.errors import SmartSimError # Black and Pylint disagree about where to put the `...` # pylint: disable=multiple-statements @@ -39,6 +40,43 @@ class DragonRequest(BaseModel): ... +class DragonRunPolicy(BaseModel): + """Policy specifying hardware constraints when running a Dragon job""" + + cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + """List of CPU indices to which the job should be pinned""" + gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + """List of GPU indices to which the job should be pinned""" + + @staticmethod + def from_run_args( + run_args: t.Dict[str, t.Union[int, str, float, None]] + ) -> "DragonRunPolicy": + """Create a DragonRunPolicy with hardware constraints passed from + a dictionary of run arguments + :param run_args: Dictionary of run arguments + :returns: DragonRunPolicy instance created from the run arguments""" + gpu_args = "" + if gpu_arg_value := run_args.get("gpu-affinity", None): + gpu_args = str(gpu_arg_value) + + cpu_args = "" + if cpu_arg_value := run_args.get("cpu-affinity", None): + cpu_args = str(cpu_arg_value) + + # run args converted to a string must be split back into a list[int] + gpu_affinity = [int(x.strip()) for x in gpu_args.split(",") if x] + cpu_affinity = [int(x.strip()) for x in cpu_args.split(",") if x] + + try: + return DragonRunPolicy( + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + except ValidationError as ex: + raise SmartSimError("Unable to build DragonRunPolicy") from ex + + class DragonRunRequestView(DragonRequest): exe: t.Annotated[str, Field(min_length=1)] exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = [] @@ -57,6 +95,7 @@ class DragonRunRequestView(DragonRequest): @request_registry.register("run") class DragonRunRequest(DragonRunRequestView): current_env: t.Dict[str, t.Optional[str]] = {} + policy: t.Optional[DragonRunPolicy] = None def __str__(self) -> str: return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"}))) diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index b8baa4708c..69a91547e7 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -28,6 +28,8 @@ import typing as t +from typing_extensions import override + from ..log import get_logger from .base import RunSettings @@ -63,6 +65,7 @@ def __init__( **kwargs, ) + @override def set_nodes(self, nodes: int) -> None: """Set the number of nodes @@ -70,9 +73,38 @@ def set_nodes(self, nodes: int) -> None: """ self.run_args["nodes"] = nodes + @override def set_tasks_per_node(self, tasks_per_node: int) -> 
None: """Set the number of tasks for this job :param tasks_per_node: number of tasks per node """ self.run_args["tasks-per-node"] = tasks_per_node + + @override + def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + """Specify the node feature for this job + + :param feature_list: a collection of strings representing the required + node features. Currently supported node features are: "gpu" + """ + if isinstance(feature_list, str): + feature_list = feature_list.strip().split() + elif not all(isinstance(feature, str) for feature in feature_list): + raise TypeError("feature_list must be string or list of strings") + + self.run_args["node-feature"] = ",".join(feature_list) + + def set_cpu_affinity(self, devices: t.List[int]) -> None: + """Set the CPU affinity for this job + + :param devices: list of CPU indices to execute on + """ + self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices) + + def set_gpu_affinity(self, devices: t.List[int]) -> None: + """Set the GPU affinity for this job + + :param devices: list of GPU indices to execute on. + """ + self.run_args["gpu-affinity"] = ",".join(str(device) for device in devices) diff --git a/tests/test_dragon_client.py b/tests/test_dragon_client.py new file mode 100644 index 0000000000..80257b6107 --- /dev/null +++ b/tests/test_dragon_client.py @@ -0,0 +1,192 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
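+# Note: these tests drive the dragon client entrypoint (+submit) with the
+# DragonConnector mocked out where needed, so no live Dragon server or
+# allocation is required to run them.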
+import os +import pathlib +import typing as t +from unittest.mock import MagicMock + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +import smartsim._core.entrypoints.dragon_client as dragon_client +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> "DragonBatchStep": + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + + +def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: + """Helper method for finding the path to a request file from the launch command""" + script_path = pathlib.Path(launch_cmd[-1]) + batch_script = script_path.read_text(encoding="utf-8") + batch_statements = [line for line in batch_script.split("\n") if line] + entrypoint_cmd = batch_statements[-1] + requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) + return requests_file + + +def test_dragon_client_main_no_arg(monkeypatch: pytest.MonkeyPatch): + """Verify the client fails when the path to a submission file is not provided.""" + with pytest.raises(SystemExit): + dragon_client.cleanup = MagicMock() + dragon_client.main([]) + + # arg parser failures occur before resource allocation and should + # not result in resource cleanup being called + assert not dragon_client.cleanup.called + + +def test_dragon_client_main_empty_arg(test_dir: str): + """Verify the client fails when the path to a submission file is empty.""" + + with pytest.raises(ValueError) as ex: + dragon_client.cleanup = MagicMock() + dragon_client.main(["+submit", ""]) + + # verify it's a value error related to submit argument + assert "file not provided" in ex.value.args[0] + + # arg parser failures occur before resource allocation and should + # not result in resource cleanup being called + assert not dragon_client.cleanup.called + + +def test_dragon_client_main_bad_arg(test_dir: str): + """Verify the client returns a failure code when the path to a submission file is + invalid and does not raise an exception""" + path = pathlib.Path(test_dir) / "nonexistent_file.json" + + dragon_client.cleanup = MagicMock() + return_code = dragon_client.main(["+submit", str(path)]) + + # ensure non-zero return code + assert return_code != 0 + + # ensure failures do not block resource cleanup + assert dragon_client.cleanup.called + + +def test_dragon_client_main( + dragon_batch_step: DragonBatchStep, monkeypatch: pytest.MonkeyPatch +): + """Verify the client returns a failure code when the path to a submission file is + invalid and does not raise an exception""" + launch_cmd = dragon_batch_step.get_launch_cmd() + path = get_request_path_from_batch_script(launch_cmd) + num_requests_in_batch = 4 + num_shutdown_requests = 1 + request_count = num_requests_in_batch + num_shutdown_requests + submit_value = str(path) + + mock_connector = MagicMock() # DragonConnector + mock_connector.is_connected = True + mock_connector.send_request.return_value = DragonRunResponse(step_id="mock_step_id") + # mock can_monitor to exit before the infinite loop checking for shutdown + mock_connector.can_monitor = False + + mock_connector_class = MagicMock() + mock_connector_class.return_value = mock_connector + + # with monkeypatch.context() as ctx: + dragon_client.DragonConnector = mock_connector_class + dragon_client.cleanup = MagicMock() + + return_code = dragon_client.main(["+submit", submit_value]) + + # verify each request in the request file was processed + assert mock_connector.send_request.call_count == request_count + + # we know the batch fixture has a step with no affinity args supplied. 
skip it + for i in range(1, num_requests_in_batch): + sent_args = mock_connector.send_request.call_args_list[i][0] + request_arg = sent_args[0] + + assert isinstance(request_arg, DragonRunRequest) + + policy = request_arg.policy + + # make sure each policy has been read in correctly with valid affinity indices + assert len(policy.cpu_affinity) == len(set(policy.cpu_affinity)) + assert len(policy.gpu_affinity) == len(set(policy.gpu_affinity)) + + # we get a non-zero due to avoiding the infinite loop. consider refactoring + assert return_code == os.EX_IOERR + + # ensure failures do not block resource cleanup + assert dragon_client.cleanup.called diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index ee0fcb14b7..4fe8bf71b4 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -31,6 +31,7 @@ import sys import time import typing as t +from unittest.mock import MagicMock import pytest import zmq @@ -38,15 +39,74 @@ import smartsim._core.config from smartsim._core._cli.scripts.dragon_install import create_dotenv from smartsim._core.config.config import get_config -from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import ( + DragonConnector, + DragonLauncher, +) from smartsim._core.launcher.dragon.dragonSockets import ( get_authenticator, get_secure_socket, ) +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest -from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse +from smartsim._core.schemas.dragonResponses import ( + DragonHandshakeResponse, + DragonRunResponse, +) from smartsim._core.utils.network import IFConfig, find_free_port from smartsim._core.utils.security import KeyManager +from smartsim.error.errors import LauncherError +from smartsim.settings.dragonRunSettings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> DragonBatchStep: + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... 
+ step.meta["status_dir"] = status_dir + # ... and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -521,3 +581,162 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): # any non-dragon keys that didn't exist avoid unnecessary prepending assert merged_env[non_dragon_key] == non_dragon_value + + +def test_run_step_fail(test_dir: str) -> None: + """Verify that the dragon launcher still returns the step id + when the running step fails""" + test_path = pathlib.Path(test_dir) + status_dir = (test_path / ".smartsim" / "logs").as_posix() + + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + step0 = DragonStep("step0", test_dir, rs) + step0.meta["status_dir"] = status_dir + + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse(step_id=step0.name, error_message="mock fail!") + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + + result = launcher.run(step0) + + # verify the failed step name is in the result + assert step0.name in result + + +def test_run_step_batch_empty(dragon_batch_step: DragonBatchStep) -> None: + """Verify that the dragon launcher behaves when asked to execute + a batch step that has no sub-steps""" + # remove the steps added in the batch fixture + dragon_batch_step.steps.clear() + + mock_step_id = "MOCK-STEPID" + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse( + step_id=dragon_batch_step.name, error_message="mock fail!" + ) + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + launcher.task_manager.start_and_wait = MagicMock(return_value=(0, mock_step_id, "")) + + result = launcher.run(dragon_batch_step) + + # verify a step name is returned + assert result + # verify the batch step name is not in the result (renamed to SLURM-*) + assert dragon_batch_step.name not in result + + send_invocation = mock_connector.send_request + + # verify a batch request is not sent through the dragon connector + send_invocation.assert_not_called() + + +def test_run_step_batch_failure(dragon_batch_step: DragonBatchStep) -> None: + """Verify that the dragon launcher sends returns the step id + when the running step fails""" + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse( + step_id=dragon_batch_step.name, error_message="mock fail!" + ) + ) + + mock_step_id = "MOCK-STEPID" + error_msg = "DOES_NOT_COMPUTE!" 
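+    # simulate a failed batch submission: start_and_wait is mocked to return a
+    # non-zero return code along with the error message, so no script actually runs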
+ launcher = DragonLauncher() + launcher._connector = mock_connector + launcher.task_manager.start_and_wait = MagicMock( + return_value=(1, mock_step_id, error_msg) + ) + + # a non-zero return code from the batch script should raise an error + with pytest.raises(LauncherError) as ex: + launcher.run(dragon_batch_step) + + # verify the correct error message is in the exception + assert error_msg in ex.value.args[0] + + +def test_run_step_success(test_dir: str) -> None: + """Verify that the dragon launcher sends the correctly formatted request for a step""" + test_path = pathlib.Path(test_dir) + status_dir = (test_path / ".smartsim" / "logs").as_posix() + + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + step0 = DragonStep("step0", test_dir, rs) + step0.meta["status_dir"] = status_dir + + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse(step_id=step0.name) + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + + result = launcher.run(step0) + + # verify the successfully executed step name is in the result + assert step0.name in result + + # verify the DragonRunRequest sent matches all expectations + send_invocation = mock_connector.send_request + send_invocation.assert_called_once() + + args = send_invocation.call_args[0] # call_args == t.Tuple[args, kwargs] + + dragon_run_request = args[0] + req_name = dragon_run_request.name # name sent to dragon env + assert req_name.startswith(step0.name) + + req_policy_cpu_affinity = dragon_run_request.policy.cpu_affinity + assert not req_policy_cpu_affinity # default should be empty list + + req_policy_gpu_affinity = dragon_run_request.policy.gpu_affinity + assert not req_policy_gpu_affinity # default should be empty list + + +def test_run_step_success_batch( + monkeypatch: pytest.MonkeyPatch, dragon_batch_step: DragonBatchStep +) -> None: + """Verify that the dragon launcher sends the correctly formatted request + for a batch step""" + mock_connector = MagicMock() # DragonConnector() + mock_connector.is_connected = True + mock_connector.send_request = MagicMock( + return_value=DragonRunResponse(step_id=dragon_batch_step.name) + ) + + launcher = DragonLauncher() + launcher._connector = mock_connector + launcher.task_manager.start_and_wait = MagicMock(return_value=(0, "success", "")) + + result = launcher.run(dragon_batch_step) + + # verify the successfully executed step name is in the result + assert dragon_batch_step.name not in result + assert result + + send_invocation = mock_connector.send_request + + # verify a batch request is not sent through the dragon connector + send_invocation.assert_not_called() + launcher.task_manager.start_and_wait.assert_called_once() + + args = launcher.task_manager.start_and_wait.call_args[0] + + # verify the batch script is executed + launch_cmd = dragon_batch_step.get_launch_cmd() + for stmt in launch_cmd: + assert stmt in args[0] # args[0] is the cmd list sent to subprocess.Popen diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py new file mode 100644 index 0000000000..1d8d069fab --- /dev/null +++ b/tests/test_dragon_run_policy.py @@ -0,0 +1,371 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings.dragonRunSettings import DragonRunSettings +from smartsim.settings.slurmSettings import SbatchSettings + +try: + from dragon.infrastructure.policy import Policy + + import smartsim._core.entrypoints.dragon as drg + from smartsim._core.launcher.dragon.dragonBackend import DragonBackend + + dragon_loaded = True +except: + dragon_loaded = False + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> "DragonBatchStep": + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +@pytest.mark.parametrize( + "dragon_request", + [ + pytest.param(DragonHandshakeRequest(), id="DragonHandshakeRequest"), + pytest.param(DragonShutdownRequest(), id="DragonShutdownRequest"), + pytest.param( + DragonBootstrapRequest(address="localhost"), id="DragonBootstrapRequest" + ), + ], +) +def test_create_run_policy_non_run_request(dragon_request: DragonRequest) -> None: + """Verify that a default policy is returned when a request is + not attempting to start a new proccess (e.g. a DragonRunRequest)""" + policy = DragonBackend.create_run_policy(dragon_request, "localhost") + + assert policy is not None, "Default policy was not returned" + assert ( + policy.device == Policy.Device.DEFAULT + ), "Default device was not Device.DEFAULT" + assert policy.cpu_affinity == [], "Default cpu affinity was not empty" + assert policy.gpu_affinity == [], "Default gpu affinity was not empty" + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_no_run_policy() -> None: + """Verify that a policy specifying no policy is returned with all default + values (no device, empty cpu & gpu affinity)""" + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + # policy= # <--- skipping this + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert policy.device == Policy.Device.DEFAULT + assert set(policy.cpu_affinity) == set() + assert policy.gpu_affinity == [] + assert policy.affinity == Policy.Affinity.DEFAULT + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_default_run_policy() -> None: + """Verify that a policy specifying no affinity is returned with + default value for device and empty affinity lists""" + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), # <--- passing default values + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert set(policy.cpu_affinity) == set() + assert set(policy.gpu_affinity) == set() + assert policy.affinity == Policy.Affinity.DEFAULT + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: + """Verify that a input policy specifying a CPU affinity but lacking the device field + produces a Dragon Policy with the CPU device specified""" + affinity = set([0, 2, 4]) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=list(affinity)), # <-- no device spec + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert set(policy.cpu_affinity) == affinity + assert policy.gpu_affinity == [] + assert policy.affinity == Policy.Affinity.SPECIFIC + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_cpu_affinity() -> None: + """Verify that a policy specifying CPU affinity 
is returned as expected""" + affinity = set([0, 2, 4]) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=list(affinity)), + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert set(policy.cpu_affinity) == affinity + assert policy.gpu_affinity == [] + assert policy.affinity == Policy.Affinity.SPECIFIC + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_create_run_policy_run_request_gpu_affinity() -> None: + """Verify that a policy specifying GPU affinity is returned as expected""" + affinity = set([0, 2, 4]) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(device="gpu", gpu_affinity=list(affinity)), + ) + + policy = DragonBackend.create_run_policy(run_req, "localhost") + + assert policy.cpu_affinity == [] + assert set(policy.gpu_affinity) == set(affinity) + assert policy.affinity == Policy.Affinity.SPECIFIC + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_dragon_run_policy_from_run_args() -> None: + """Verify that a DragonRunPolicy is created from a dictionary of run arguments""" + run_args = { + "gpu-affinity": "0,1,2", + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [0, 1, 2] + + +def test_dragon_run_policy_from_run_args_empty() -> None: + """Verify that a DragonRunPolicy is created from an empty + dictionary of run arguments""" + run_args = {} + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments containing a CPU affinity""" + run_args = { + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments containing a GPU affinity""" + run_args = { + "gpu-affinity": "0, 1, 2", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 1, 2] + + +def test_dragon_run_policy_from_run_args_invalid_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is NOT created from a dictionary + of run arguments with an invalid GPU affinity""" + run_args = { + "gpu-affinity": "0,-1,2", + } + + with pytest.raises(SmartSimError) as ex: + DragonRunPolicy.from_run_args(run_args) + + assert "DragonRunPolicy" in ex.value.args[0] + + +def test_dragon_run_policy_from_run_args_invalid_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is NOT created from a dictionary + of run arguments with an invalid CPU affinity""" + run_args = { + "cpu-affinity": "3,4,5,-6", + } + + with pytest.raises(SmartSimError) as ex: + DragonRunPolicy.from_run_args(run_args) + + assert "DragonRunPolicy" in ex.value.args[0] + + +def test_dragon_run_policy_from_run_args_ignore_empties_gpu() -> None: + """Verify that a DragonRunPolicy is 
created from a dictionary + of run arguments and ignores empty values in the serialized gpu list""" + run_args = { + "gpu-affinity": "0,,2", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 2] + + +def test_dragon_run_policy_from_run_args_ignore_empties_cpu() -> None: + """Verify that a DragonRunPolicy is created from a dictionary + of run arguments and ignores empty values in the serialized cpu list""" + run_args = { + "cpu-affinity": "3,4,,6,", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_null_gpu_affinity() -> None: + """Verify that a DragonRunPolicy is created if a null value is encountered + in the gpu-affinity list""" + run_args = { + "gpu-affinity": None, + "cpu-affinity": "3,4,5,6", + } + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [3, 4, 5, 6] + assert policy.gpu_affinity == [] + + +def test_dragon_run_policy_from_run_args_null_cpu_affinity() -> None: + """Verify that a DragonRunPolicy is created if a null value is encountered + in the cpu-affinity list""" + run_args = {"gpu-affinity": "0,1,2", "cpu-affinity": None} + + policy = DragonRunPolicy.from_run_args(run_args) + + assert policy.cpu_affinity == [] + assert policy.gpu_affinity == [0, 1, 2] diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_run_request.py similarity index 64% rename from tests/test_dragon_backend.py rename to tests/test_dragon_run_request.py index f284f38d99..94c17c222a 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_run_request.py @@ -31,19 +31,17 @@ from unittest.mock import MagicMock import pytest +from pydantic import ValidationError # The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_a +pytestmark = pytest.mark.group_b try: import dragon -except ImportError: - pass -else: - pytest.skip( - reason="Using dragon as launcher, not running Dragon unit tests", - allow_module_level=True, - ) + + dragon_loaded = True +except: + dragon_loaded = False from smartsim._core.config import CONFIG from smartsim._core.schemas.dragonRequests import * @@ -59,10 +57,36 @@ class NodeMock(MagicMock): + def __init__( + self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 + ) -> None: + super().__init__() + self._mock_id = name + NodeMock._num_gpus = num_gpus + NodeMock._num_cpus = num_cpus + @property def hostname(self) -> str: + if self._mock_id: + return self._mock_id return create_short_id_str() + @property + def num_cpus(self) -> str: + return NodeMock._num_cpus + + @property + def num_gpus(self) -> str: + return NodeMock._num_gpus + + def _set_id(self, value: str) -> None: + self._mock_id = value + + def gpus(self, parent: t.Any = None) -> t.List[str]: + if self._num_gpus: + return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)] + return [] + class GroupStateMock(MagicMock): def Running(self) -> MagicMock: @@ -78,13 +102,19 @@ class ProcessGroupMock(MagicMock): puids = [121, 122] -def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": +def node_mock() -> NodeMock: + return NodeMock() + + +def get_mock_backend( + monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2 +) -> "DragonBackend": process_mock = MagicMock(returncode=0) process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) process_module_mock = MagicMock() 
process_module_mock.Process = process_mock - node_mock = NodeMock() + node_mock = NodeMock(num_gpus=num_gpus) system_mock = MagicMock(nodes=["node1", "node2", "node3"]) monkeypatch.setitem( sys.modules, @@ -199,6 +229,7 @@ def set_mock_group_infos( return group_infos +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -209,6 +240,7 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: assert handshake_resp.dragon_pid == 99999 +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) run_req = DragonRunRequest( @@ -259,6 +291,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -284,6 +317,78 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.group_infos[step_id].status == SmartSimStatus.STATUS_FAILED +def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a policy is applied to a run request""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=None, + ) + assert run_req.policy is None + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a policy is applied to a run request""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=[0, 1]), + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + + step_id = run_resp.step_id + assert dragon_backend._queued_steps[step_id] == run_req + + mock_process_group = MagicMock(puids=[123, 124]) + + dragon_backend._group_infos[step_id].process_group = mock_process_group + dragon_backend._group_infos[step_id].puids = [123, 124] + dragon_backend._start_steps() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + monkeypatch.setattr( + dragon_backend._group_infos[step_id].process_group, "status", "Running" + ) + + dragon_backend._update() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + + dragon_backend._update() + assert not dragon_backend._running_steps + + 
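The backend tests above and below rely on `NodeMock`, a `MagicMock` subclass whose `hostname`, `num_cpus`, and `num_gpus` properties return deterministic values while every other attribute remains auto-mocked. A minimal, self-contained sketch of that pattern is shown here; `FakeNode` and `describe_node` are illustrative names for this note, not part of SmartSim.

```python
from unittest.mock import MagicMock


class FakeNode(MagicMock):
    """MagicMock subclass that reports fixed hardware counts (illustrative only)."""

    def __init__(self, name: str = "node0", num_cpus: int = 8, num_gpus: int = 2) -> None:
        super().__init__()
        self._name = name
        self._cpus = num_cpus
        self._gpus = num_gpus

    @property
    def hostname(self) -> str:
        return self._name

    @property
    def num_cpus(self) -> int:
        return self._cpus

    @property
    def num_gpus(self) -> int:
        return self._gpus


def describe_node(node: FakeNode) -> str:
    # only the pinned-down properties are deterministic; any other attribute
    # access still returns an auto-created child MagicMock
    return f"{node.hostname}: {node.num_cpus} cpus, {node.num_gpus} gpus"


assert describe_node(FakeNode("node1", num_cpus=4, num_gpus=0)) == "node1: 4 cpus, 0 gpus"
```

Because the properties are defined on the subclass, ordinary attribute lookup resolves them before `MagicMock.__getattr__` is consulted, which is what allows tests such as the GPU-availability checks in this patch to read realistic counts from an otherwise fully mocked node.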
+@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -300,6 +405,7 @@ def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: } +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) @@ -331,6 +437,7 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: assert len(dragon_backend._free_hosts) == 3 +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize( "immediate, kill_jobs, frontend_shutdown", [ @@ -389,6 +496,7 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("telemetry_flag", ["0", "1"]) def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) @@ -404,6 +512,7 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) - assert dragon_backend.cooldown_period == expected_cooldown +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) first_heartbeat = dragon_backend.last_heartbeat @@ -412,6 +521,7 @@ def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend.last_heartbeat > first_heartbeat +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("num_nodes", [1, 3, 100]) def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -432,6 +542,119 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: ) +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +@pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) +def test_can_honor_cpu_affinity( + monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] +) -> None: + """Verify that valid CPU affinities are accepted""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=affinity), + ) + + assert dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that invalid CPU affinities are NOT accepted + NOTE: negative values are captured by the Pydantic schema""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(cpu_affinity=list(range(9))), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") 
+@pytest.mark.parametrize("affinity", [[0], [0, 1]]) +def test_can_honor_gpu_affinity( + monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] +) -> None: + """Verify that valid GPU affinities are accepted""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(gpu_affinity=affinity), + ) + + assert dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that invalid GPU affinities are NOT accepted + NOTE: negative values are captured by the Pydantic schema""" + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(gpu_affinity=list(range(3))), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") +def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a request for a GPU if none exists is not accepted""" + + # create a mock node class that always reports no GPUs available + dragon_backend = get_mock_backend(monkeypatch, num_gpus=0) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + # specify GPU device w/no affinity + policy=DragonRunPolicy(gpu_affinity=[0]), + ) + + assert not dragon_backend._can_honor(run_req)[0] + + +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) step_id = next(dragon_backend._step_ids) @@ -440,6 +663,7 @@ def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: assert step_id != next(dragon_backend._step_ids) +@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_view(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) set_mock_group_infos(monkeypatch, dragon_backend) @@ -447,17 +671,21 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: expected_message = textwrap.dedent(f"""\ Dragon server backend update - | Host | Status | - |---------|----------| + | Host | Status | + |--------|----------| | {hosts[0]} | Busy | | {hosts[1]} | Free | | {hosts[2]} | Free | | Step | Status | Hosts | Return codes | Num procs | - |----------|--------------|-----------------|----------------|-------------| + |----------|--------------|-------------|----------------|-------------| | abc123-1 | Running | {hosts[0]} | | 1 | | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | | ljace0-5 | NeverStarted | | | 0 |""") - assert dragon_backend.status_message == expected_message + # get rid of white space to make the comparison easier + actual_msg = dragon_backend.status_message.replace(" ", "") + expected_message = expected_message.replace(" ", "") + + assert actual_msg == expected_message diff --git a/tests/test_dragon_run_request_nowlm.py 
b/tests/test_dragon_run_request_nowlm.py new file mode 100644 index 0000000000..afd25aa9d7 --- /dev/null +++ b/tests/test_dragon_run_request_nowlm.py @@ -0,0 +1,105 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +from pydantic import ValidationError + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +def test_run_request_with_null_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that an empty policy does not cause an error""" + # dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=None, + ) + assert run_req.policy is None + + +def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that a non-empty policy is set correctly""" + # dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), + ) + assert run_req.policy is not None + assert not run_req.policy.cpu_affinity + assert not run_req.policy.gpu_affinity + + +@pytest.mark.parametrize( + "device,cpu_affinity,gpu_affinity", + [ + pytest.param("cpu", [-1], [], id="cpu_affinity"), + pytest.param("gpu", [], [-1], id="gpu_affinity"), + ], +) +def test_run_request_with_negative_affinity( + device: str, + cpu_affinity: t.List[int], + gpu_affinity: t.List[int], +) -> None: + """Verify that invalid affinity values fail validation""" + with pytest.raises(ValidationError) as ex: + DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy( + cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity + ), + ) + + assert f"{device}_affinity" in str(ex.value.args[0]) + assert "NumberNotGeError" in 
str(ex.value.args[0]) diff --git a/tests/test_dragon_runsettings.py b/tests/test_dragon_runsettings.py new file mode 100644 index 0000000000..34e8510e82 --- /dev/null +++ b/tests/test_dragon_runsettings.py @@ -0,0 +1,98 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim.settings import DragonRunSettings + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_a + + +def test_dragon_runsettings_nodes(): + """Verify that node count is set correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + exp_value = 3 + rs.set_nodes(exp_value) + assert rs.run_args["nodes"] == exp_value + + exp_value = 9 + rs.set_nodes(exp_value) + assert rs.run_args["nodes"] == exp_value + + +def test_dragon_runsettings_tasks_per_node(): + """Verify that tasks per node is set correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + exp_value = 3 + rs.set_tasks_per_node(exp_value) + assert rs.run_args["tasks-per-node"] == exp_value + + exp_value = 7 + rs.set_tasks_per_node(exp_value) + assert rs.run_args["tasks-per-node"] == exp_value + + +def test_dragon_runsettings_cpu_affinity(): + """Verify that the CPU affinity is set correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + exp_value = [0, 1, 2, 3] + rs.set_cpu_affinity([0, 1, 2, 3]) + assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value) + + # ensure the value is not changed when we extend the list + exp_value.extend([4, 5, 6]) + assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value) + + rs.set_cpu_affinity(exp_value) + assert rs.run_args["cpu-affinity"] == ",".join(str(val) for val in exp_value) + + # ensure the value is not changed when we extend the list + rs.run_args["cpu-affinity"] = "7,8,9" + assert rs.run_args["cpu-affinity"] != ",".join(str(val) for val in exp_value) + + +def test_dragon_runsettings_gpu_affinity(): + """Verify that the GPU affinity is set correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + exp_value = [0, 1, 2, 3] + rs.set_gpu_affinity([0, 1, 2, 3]) + assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value) + + # ensure 
the value is not changed when we extend the list + exp_value.extend([4, 5, 6]) + assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value) + + rs.set_gpu_affinity(exp_value) + assert rs.run_args["gpu-affinity"] == ",".join(str(val) for val in exp_value) + + # ensure the value is not changed when we extend the list + rs.run_args["gpu-affinity"] = "7,8,9" + assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value) diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py new file mode 100644 index 0000000000..19f408e0bd --- /dev/null +++ b/tests/test_dragon_step.py @@ -0,0 +1,394 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
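The `DragonRunSettings` affinity tests above assert that `set_cpu_affinity` and `set_gpu_affinity` store a comma-joined snapshot of the list, so later mutations of the caller's list (or direct edits to `run_args`) do not silently track each other. A rough standalone sketch of that contract follows; `FakeRunSettings` is an illustrative stand-in, not the real class.

```python
import typing as t


class FakeRunSettings:
    """Illustrative stand-in for run settings that store affinities in run_args."""

    def __init__(self) -> None:
        self.run_args: t.Dict[str, str] = {}

    def set_cpu_affinity(self, devices: t.List[int]) -> None:
        # join immediately so run_args holds a snapshot, not a live reference
        self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices)

    def set_gpu_affinity(self, devices: t.List[int]) -> None:
        self.run_args["gpu-affinity"] = ",".join(str(device) for device in devices)


rs = FakeRunSettings()
affinity = [0, 1, 2, 3]
rs.set_cpu_affinity(affinity)

affinity.extend([4, 5])  # mutating the caller's list afterwards...
assert rs.run_args["cpu-affinity"] == "0,1,2,3"  # ...does not alter the stored value
```

Joining at call time is what makes the `exp_value.extend(...)` assertions in the tests hold: the stored string is decoupled from the list object that produced it.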
+ +import json +import pathlib +import shutil +import sys +import typing as t + +import pytest + +from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep +from smartsim.settings import DragonRunSettings +from smartsim.settings.pbsSettings import QsubBatchSettings +from smartsim.settings.slurmSettings import SbatchSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * + + +@pytest.fixture +def dragon_batch_step(test_dir: str) -> DragonBatchStep: + """Fixture for creating a default batch of steps for a dragon launcher""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + # create some steps to verify the requests file output changes + rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) + rs1 = DragonRunSettings(exe="sleep", exe_args=["2"]) + rs2 = DragonRunSettings(exe="sleep", exe_args=["3"]) + rs3 = DragonRunSettings(exe="sleep", exe_args=["4"]) + + names = "test00", "test01", "test02", "test03" + settings = rs0, rs1, rs2, rs3 + + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + # assign some unique affinities to each run setting instance + for index, rs in enumerate(settings): + if gpu_affinities[index]: + rs.set_node_feature("gpu") + rs.set_cpu_affinity(cpu_affinities[index]) + rs.set_gpu_affinity(gpu_affinities[index]) + + steps = list( + DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) + ) + + for index, step in enumerate(steps): + # ensure meta is configured... + step.meta["status_dir"] = status_dir + # ... 
and put all the steps into the batch + batch_step.add_to_batch(steps[index]) + + return batch_step + + +def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: + """Helper method for finding the path to a request file from the launch command""" + script_path = pathlib.Path(launch_cmd[-1]) + batch_script = script_path.read_text(encoding="utf-8") + batch_statements = [line for line in batch_script.split("\n") if line] + entrypoint_cmd = batch_statements[-1] + requests_file = pathlib.Path(entrypoint_cmd.split()[-1]) + return requests_file + + +def test_dragon_step_creation(test_dir: str) -> None: + """Verify that the step is created with the values provided""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + # confirm the name has been made unique to avoid conflicts + assert step.name != original_name + assert step.entity_name == original_name + assert step.cwd == test_dir + assert step.step_settings is not None + + +def test_dragon_step_name_uniqueness(test_dir: str) -> None: + """Verify that step name is unique and independent of step content""" + + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + original_name = "test" + + num_steps = 100 + steps = [DragonStep(original_name, test_dir, rs) for _ in range(num_steps)] + + # confirm the name has been made unique in each step + step_names = {step.name for step in steps} + assert len(step_names) == num_steps + + +def test_dragon_step_launch_cmd(test_dir: str) -> None: + """Verify the expected launch cmd is generated w/minimal settings""" + exp_exe = "sleep" + exp_exe_args = "1" + rs = DragonRunSettings(exe=exp_exe, exe_args=[exp_exe_args]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 2 + + # we'll verify the exe_args and exe name are handled correctly + exe, args = launch_cmd + assert exp_exe in exe + assert exp_exe_args in args + + # also, verify that a string exe_args param instead of list is handled correctly + exp_exe_args = "1 2 3" + rs = DragonRunSettings(exe=exp_exe, exe_args=exp_exe_args) + step = DragonStep(original_name, test_dir, rs) + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 4 # "/foo/bar/sleep 1 2 3" + + +def test_dragon_step_launch_cmd_multi_arg(test_dir: str) -> None: + """Verify the expected launch cmd is generated when multiple arguments + are passed to run settings""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + + original_name = "test" + + step = DragonStep(original_name, test_dir, rs) + + launch_cmd = step.get_launch_cmd() + assert len(launch_cmd) == 4 + + exe, *args = launch_cmd + assert exp_exe in exe + assert arg0 in args + assert arg1 in args + assert arg2 in args + + +def test_dragon_step_launch_cmd_no_bash( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that requirement for bash shell is checked""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + rs.colocated_db_settings = {"foo": "bar"} # triggers bash lookup + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + + with pytest.raises(RuntimeError) as ex, monkeypatch.context() as ctx: + ctx.setattr(shutil, "which", lambda _: None) + step.get_launch_cmd() + + # verify the exception thrown is the one we're looking for + assert "Could not 
find" in ex.value.args[0] + + +def test_dragon_step_colocated_db() -> None: + # todo: implement a test for the branch where bash is found and + # run_settings.colocated_db_settings is set + ... + + +def test_dragon_step_container() -> None: + # todo: implement a test for the branch where run_settings.container + # is an instance of class `Singularity` + ... + + +def test_dragon_step_run_settings_accessor(test_dir: str) -> None: + """Verify the run settings passed to the step are copied correctly and + are not inadvertently modified outside the step""" + exp_exe = "sleep" + arg0, arg1, arg2 = "1", "2", "3" + rs = DragonRunSettings(exe=exp_exe, exe_args=[arg0, arg1, arg2]) + + original_name = "test" + step = DragonStep(original_name, test_dir, rs) + rs_output = step.run_settings + + assert rs.exe == rs_output.exe + assert rs.exe_args == rs_output.exe_args + + # ensure we have a deep copy + rs.exe = "foo" + assert id(step.run_settings) != id(rs) + assert step.run_settings.exe != rs.exe + + +def test_dragon_batch_step_creation(test_dir: str) -> None: + """Verify that the batch step is created with the values provided""" + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # confirm the name has been made unique to avoid conflicts + assert batch_step.name != batch_step_name + assert batch_step.entity_name == batch_step_name + assert batch_step.cwd == test_dir + assert batch_step.batch_settings is not None + assert batch_step.managed + + +def test_dragon_batch_step_add_to_batch(test_dir: str) -> None: + """Verify that steps are added to the batch correctly""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + name0, name1, name2 = "test00", "test01", "test02" + step0 = DragonStep(name0, test_dir, rs) + step1 = DragonStep(name1, test_dir, rs) + step2 = DragonStep(name2, test_dir, rs) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + assert len(batch_step.steps) == 0 + + batch_step.add_to_batch(step0) + assert len(batch_step.steps) == 1 + assert name0 in ",".join({step.name for step in batch_step.steps}) + + batch_step.add_to_batch(step1) + assert len(batch_step.steps) == 2 + assert name1 in ",".join({step.name for step in batch_step.steps}) + + batch_step.add_to_batch(step2) + assert len(batch_step.steps) == 3 + assert name2 in ",".join({step.name for step in batch_step.steps}) + + +def test_dragon_batch_step_get_launch_command_meta_fail(test_dir: str) -> None: + """Verify that the batch launch command cannot be generated without + having the status directory set in the step metadata""" + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + with pytest.raises(KeyError) as ex: + batch_step.get_launch_cmd() + + +@pytest.mark.parametrize( + "batch_settings_class,batch_exe,batch_header,node_spec_tpl", + [ + pytest.param( + SbatchSettings, "sbatch", "#SBATCH", "#SBATCH --nodes={0}", id="sbatch" + ), + pytest.param(QsubBatchSettings, "qsub", "#PBS", "#PBS -l nodes={0}", id="qsub"), + ], +) +def test_dragon_batch_step_get_launch_command( + test_dir: str, + batch_settings_class: t.Type, + batch_exe: str, + batch_header: str, + node_spec_tpl: str, +) -> None: + """Verify that the batch launch command is properly generated and + 
the expected side effects are present (writing script file to disk)""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = batch_settings_class(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + launch_cmd = batch_step.get_launch_cmd() + assert launch_cmd + + full_cmd = " ".join(launch_cmd) + assert batch_exe in full_cmd # verify launcher running the batch + assert test_dir in full_cmd # verify outputs are sent to expected directory + assert "batch_step.sh" in full_cmd # verify batch script name is in the command + + # ...verify that the script file is written when getting the launch command + script_path = pathlib.Path(launch_cmd[-1]) + assert script_path.exists() + assert len(script_path.read_bytes()) > 0 + + batch_script = script_path.read_text(encoding="utf-8") + + # ...verify the script file has the expected batch script header content + assert batch_header in batch_script + assert node_spec_tpl.format(num_nodes) in batch_script # verify node count is set + + # ...verify the script has the expected entrypoint command + batch_statements = [line for line in batch_script.split("\n") if line] + python_path = sys.executable + + entrypoint_cmd = batch_statements[-1] + assert python_path in entrypoint_cmd + assert "smartsim._core.entrypoints.dragon_client +submit" in entrypoint_cmd + + +def test_dragon_batch_step_write_request_file_no_steps(test_dir: str) -> None: + """Verify that the batch launch command writes an appropriate request file + if no steps are attached""" + test_path = pathlib.Path(test_dir) + + batch_step_name = "batch_step" + num_nodes = 4 + batch_settings = SbatchSettings(nodes=num_nodes) + batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) + + # ensure the status_dir is set + status_dir = (test_path / ".smartsim" / "logs").as_posix() + batch_step.meta["status_dir"] = status_dir + + launch_cmd = batch_step.get_launch_cmd() + requests_file = get_request_path_from_batch_script(launch_cmd) + + # no steps have been added yet, so the requests file should be a serialized, empty list + assert requests_file.read_text(encoding="utf-8") == "[]" + + +def test_dragon_batch_step_write_request_file( + dragon_batch_step: DragonBatchStep, +) -> None: + """Verify that the batch launch command writes an appropriate request file + for the set of attached steps""" + # create steps with: + # no affinity, cpu affinity only, gpu affinity only, cpu and gpu affinity + cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] + gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + + launch_cmd = dragon_batch_step.get_launch_cmd() + requests_file = get_request_path_from_batch_script(launch_cmd) + + requests_text = requests_file.read_text(encoding="utf-8") + requests_json: t.List[str] = json.loads(requests_text) + + # verify that there is an item in file for each step added to the batch + assert len(requests_json) == len(dragon_batch_step.steps) + + for index, req in enumerate(requests_json): + req_type, req_data = req.split("|", 1) + # the only steps added are to execute apps, requests should be of type "run" + assert req_type == "run" + + run_request = DragonRunRequest(**json.loads(req_data)) + assert run_request + assert run_request.policy.cpu_affinity == cpu_affinities[index] + assert run_request.policy.gpu_affinity == gpu_affinities[index] From 
272a1d70271256c8f5ed39bde743dd846e952e0f Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:08:47 -0700 Subject: [PATCH 10/60] Improve error handling in worker manager (#629) This PR aims to allow the `WorkerManager` to continue if a `worker` throws an error. The `WorkerManager` needs to return a `Response` without blowing up in the process. [ committed by @AlyssaCote ] [ approved by @mellis13 @ankona ] --- Makefile | 6 +- doc/changelog.md | 1 + .../infrastructure/control/workermanager.py | 192 +++++++++---- .../_core/mli/infrastructure/worker/worker.py | 9 +- smartsim/_core/mli/message_handler.py | 8 +- .../mli/mli_schemas/response/response.capnp | 5 +- .../mli_schemas/response/response_capnp.pyi | 4 +- tests/dragon/test_error_handling.py | 270 ++++++++++++++++++ tests/dragon/test_reply_building.py | 91 ++++++ tests/dragon/utils/channel.py | 64 +++++ tests/dragon/utils/worker.py | 128 +++++++++ tests/mli/test_worker_manager.py | 1 + 12 files changed, 709 insertions(+), 70 deletions(-) create mode 100644 tests/dragon/test_error_handling.py create mode 100644 tests/dragon/test_reply_building.py create mode 100644 tests/dragon/utils/channel.py create mode 100644 tests/dragon/utils/worker.py diff --git a/Makefile b/Makefile index aaf1736258..3ab83da892 100644 --- a/Makefile +++ b/Makefile @@ -169,17 +169,17 @@ test: # help: test-verbose - Run all tests verbosely .PHONY: test-verbose test-verbose: - @python -m pytest -vv --ignore=tests/full_wlm/ + @python -m pytest -vv --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-debug - Run all tests with debug output .PHONY: test-debug test-debug: - @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/ + @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-cov - Run all tests with coverage .PHONY: test-cov test-cov: - @python -m pytest -vv --cov=./smartsim --cov-config=${COV_FILE} --ignore=tests/full_wlm/ + @python -m pytest -vv --cov=./smartsim --cov-config=${COV_FILE} --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-full - Run all WLM tests with Python coverage (full test suite) diff --git a/doc/changelog.md b/doc/changelog.md index f4adf1c091..495cff3edd 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -14,6 +14,7 @@ Jump to: Description - Add TorchWorker first implementation and mock inference app example +- Add error handling in Worker Manager pipeline - Add EnvironmentConfigLoader for ML Worker Manager - Add Model schema with model metadata included - Removed device from schemas, MessageHandler and tests diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8c06351fb5..8e3ed3fb4c 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,13 +51,13 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response +from ...mli_schemas.response.response_capnp import Response, ResponseBuilder if t.TYPE_CHECKING: from dragon.fli import FLInterface from smartsim._core.mli.mli_schemas.model.model_capnp import Model - from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum + from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__name__) @@ -98,6 
+98,7 @@ def deserialize_message( input_bytes: t.Optional[t.List[bytes]] = ( None # these will really be tensors already ) + output_keys: t.Optional[t.List[str]] = None input_meta: t.List[t.Any] = [] @@ -107,22 +108,26 @@ def deserialize_message( input_bytes = [data.blob for data in request.input.data] input_meta = [data.tensorDescriptor for data in request.input.data] + if request.output: + output_keys = [tensor_key.key for tensor_key in request.output] + inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, raw_inputs=input_bytes, - input_meta=input_meta, input_keys=input_keys, + input_meta=input_meta, + output_keys=output_keys, raw_model=model_bytes, batch_size=0, ) return inference_request -def build_failure_reply(status: "StatusEnum", message: str) -> Response: +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: return MessageHandler.build_response( - status=status, # todo: need to indicate correct status - message=message, # todo: decide what these will be + status=status, + message=message, result=[], custom_attributes=None, ) @@ -154,17 +159,39 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: return prepared_outputs -def build_reply(reply: InferenceReply) -> Response: +def build_reply(reply: InferenceReply) -> ResponseBuilder: results = prepare_outputs(reply) return MessageHandler.build_response( - status="complete", - message="success", + status=reply.status_enum, + message=reply.message, result=results, custom_attributes=None, ) +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) + + class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -258,96 +285,147 @@ def _on_iteration(self) -> None: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing + reply = InferenceReply() + if not request.raw_model: if request.model_key is None: - # A valid request should never get here. 
- raise ValueError("Could not read model key") + exception_handler( + ValueError("Could not find model key or model"), + request.callback, + "Could not find model key or model.", + ) + return if request.model_key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing model_result = LoadModelResult(self._cached_models[request.model_key]) else: - fetch_model_result = None - while True: - try: - interm = time.perf_counter() # timing - fetch_model_result = self._worker.fetch_model( - request, self._feature_store - ) - except KeyError: - time.sleep(0.1) - else: - break - - if fetch_model_result is None: - raise SmartSimError("Could not retrieve model from feature store") timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing + try: + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while fetching the model." + ) + return + + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + try: + model_result = self._worker.load_model( + request, + fetch_result=fetch_model_result, + device=self._device, + ) + self._cached_models[request.model_key] = model_result.model + except Exception as e: + exception_handler( + e, request.callback, "Failed while loading the model." + ) + return + + else: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + try: + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while fetching the model." + ) + return + + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + try: model_result = self._worker.load_model( - request, fetch_model_result, self._device + request, fetch_result=fetch_model_result, device=self._device ) - self._cached_models[request.model_key] = model_result.model - else: - fetch_model_result = self._worker.fetch_model(request, None) - model_result = self._worker.load_model( - request, fetch_result=fetch_model_result, device=self._device - ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while loading the model." + ) + return timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + try: + fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + except Exception as e: + exception_handler(e, request.callback, "Failed while fetching the inputs.") + return timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - transformed_input = self._worker.transform_input( - request, fetch_input_result, self._device - ) + try: + transformed_input = self._worker.transform_input( + request, fetch_input_result, self._device + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while transforming the input." 
+ ) + return timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - - reply = InferenceReply() - try: execute_result = self._worker.execute( request, model_result, transformed_input ) + except Exception as e: + exception_handler(e, request.callback, "Failed while executing.") + return - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + try: transformed_output = self._worker.transform_output( request, execute_result, self._device ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while transforming the output." + ) + return - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - if request.output_keys: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing + if request.output_keys: + try: reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_store + request, + transformed_output, + self._feature_store, ) - else: - reply.outputs = transformed_output.outputs - except Exception: - logger.exception("Error executing worker") - reply.failed = True + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." + ) + return + else: + reply.outputs = transformed_output.outputs timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - if reply.failed: - response = build_failure_reply("fail", "failure-occurred") + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") - + reply.status_enum = "complete" + reply.message = "Success" response = build_reply(reply) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - # serialized = self._worker.serialize_reply(request, transformed_output) - serialized_resp = MessageHandler.serialize_response(response) # type: ignore + serialized_resp = MessageHandler.serialize_response(response) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 900a8241de..dd874abe39 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -33,6 +33,9 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + logger = get_logger(__name__) @@ -70,12 +73,14 @@ def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, output_keys: t.Optional[t.Collection[str]] = None, - failed: bool = False, + status_enum: "Status" = "running", + message: str = "In progress", ) -> None: """Initialize the object""" self.outputs: t.Collection[t.Any] = outputs or [] self.output_keys: t.Collection[t.Optional[str]] = output_keys or [] - self.failed = failed + self.status_enum = status_enum + self.message = message class LoadModelResult: diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index bcf1cfdf14..4fe2bef3a7 100644 --- a/smartsim/_core/mli/message_handler.py +++ 
b/smartsim/_core/mli/message_handler.py @@ -360,7 +360,7 @@ def build_request( request_attributes_capnp.TensorFlowRequestAttributes, None, ], - ) -> request_capnp.Request: + ) -> request_capnp.RequestBuilder: """ Builds the request message. @@ -405,7 +405,7 @@ def deserialize_request(request_bytes: bytes) -> request_capnp.Request: @staticmethod def _assign_status( - response: response_capnp.Response, status: "response_capnp.StatusEnum" + response: response_capnp.Response, status: "response_capnp.Status" ) -> None: """ Assigns a status to the supplied response. @@ -498,7 +498,7 @@ def _assign_custom_response_attributes( @staticmethod def build_response( - status: "response_capnp.StatusEnum", + status: "response_capnp.Status", message: str, result: t.Union[ t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] @@ -508,7 +508,7 @@ def build_response( response_attributes_capnp.TensorFlowResponseAttributes, None, ], - ) -> response_capnp.Response: + ) -> response_capnp.ResponseBuilder: """ Builds the response message. diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp index 67375b5a97..83aa05a41b 100644 --- a/smartsim/_core/mli/mli_schemas/response/response.capnp +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -30,14 +30,15 @@ using Tensors = import "../tensor/tensor.capnp"; using ResponseAttributes = import "response_attributes/response_attributes.capnp"; using DataRef = import "../data/data_references.capnp"; -enum StatusEnum { +enum Status { complete @0; fail @1; timeout @2; + running @3; } struct Response { - status @0 :StatusEnum; + status @0 :Status; message @1 :Text; result :union { keys @2 :List(DataRef.TensorKey); diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi index f6d7f8444e..f19bdefe04 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -45,7 +45,7 @@ from .response_attributes.response_attributes_capnp import ( TorchResponseAttributesReader, ) -StatusEnum = Literal["complete", "fail", "timeout"] +Status = Literal["complete", "fail", "timeout", "running"] class Response: class Result: @@ -150,7 +150,7 @@ class Response: def write(file: BufferedWriter) -> None: ... @staticmethod def write_packed(file: BufferedWriter) -> None: ... - status: StatusEnum + status: Status message: str result: Response.Result | Response.ResultBuilder | Response.ResultReader customAttributes: ( diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py new file mode 100644 index 0000000000..151bdd2fcc --- /dev/null +++ b/tests/dragon/test_error_handling.py @@ -0,0 +1,270 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import pickle +from unittest.mock import MagicMock + +import pytest + +dragon = pytest.importorskip("dragon") + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import FLInterface + +from smartsim._core.mli.infrastructure.control.workermanager import ( + WorkerManager, + exception_handler, +) +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceReply, + LoadModelResult, + TransformInputResult, + TransformOutputResult, +) +from smartsim._core.mli.message_handler import MessageHandler + +from .utils.channel import FileSystemCommChannel +from .utils.worker import IntegratedTorchWorker + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.fixture +def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): + integrated_worker = IntegratedTorchWorker() + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + storage = DDict() + feature_store = DragonFeatureStore(storage) + monkeypatch.setenv( + "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") + ) + + worker_manager = WorkerManager( + EnvironmentConfigLoader(), + integrated_worker, + as_service=False, + cooldown=3, + comm_channel_type=FileSystemCommChannel, + ) + + tensor_key = MessageHandler.build_tensor_key("key") + model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") + request = MessageHandler.build_request( + test_dir, model, [tensor_key], [tensor_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + worker_manager._task_queue.send(ser_request) + + return worker_manager, integrated_worker + + +@pytest.fixture +def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): + integrated_worker = IntegratedTorchWorker() + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + storage = DDict() + feature_store = DragonFeatureStore(storage) + monkeypatch.setenv( + "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") + ) + + worker_manager = WorkerManager( + EnvironmentConfigLoader(), + integrated_worker, + as_service=False, + cooldown=3, + comm_channel_type=FileSystemCommChannel, + ) + + tensor_key = MessageHandler.build_tensor_key("key") + model_key = 
MessageHandler.build_model_key("model key") + request = MessageHandler.build_request( + test_dir, model_key, [tensor_key], [tensor_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + worker_manager._task_queue.send(ser_request) + + return worker_manager, integrated_worker + + +def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): + def mock_stage(*args, **kwargs): + raise ValueError(f"Simulated error in {stage}") + + monkeypatch.setattr(integrated_worker, stage, mock_stage) + mock_reply_fn = MagicMock() + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + mock_reply_fn, + ) + + def mock_exception_handler(exc, reply_channel, failure_message): + return exception_handler(exc, None, failure_message) + + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.workermanager.exception_handler", + mock_exception_handler, + ) + + return mock_reply_fn + + +@pytest.mark.parametrize( + "setup_worker_manager", + [ + pytest.param("setup_worker_manager_model_bytes"), + pytest.param("setup_worker_manager_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_model", "Failed while fetching the model.", id="fetch model" + ), + pytest.param("load_model", "Failed while loading the model.", id="load model"), + pytest.param( + "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" + ), + pytest.param( + "transform_input", + "Failed while transforming the input.", + id="transform inputs", + ), + pytest.param("execute", "Failed while executing.", id="execute"), + pytest.param( + "transform_output", + "Failed while transforming the output.", + id="transform output", + ), + pytest.param( + "place_output", "Failed while placing the output.", id="place output" + ), + ], +) +def test_pipeline_stage_errors_handled( + request, + setup_worker_manager, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures that the worker manager does not crash after a failure in various pipeline stages""" + worker_manager, integrated_worker = request.getfixturevalue(setup_worker_manager) + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_model"]: + monkeypatch.setattr( + integrated_worker, + "fetch_model", + MagicMock(return_value=FetchModelResult(b"result_bytes")), + ) + + if stage not in ["fetch_model", "load_model"]: + monkeypatch.setattr( + integrated_worker, + "load_model", + MagicMock(return_value=LoadModelResult(b"result_bytes")), + ) + if stage not in ["fetch_model", "load_model", "fetch_inputs"]: + monkeypatch.setattr( + integrated_worker, + "fetch_inputs", + MagicMock(return_value=FetchInputResult([b"result_bytes"], None)), + ) + if stage not in ["fetch_model", "load_model", "fetch_inputs", "transform_input"]: + monkeypatch.setattr( + integrated_worker, + "transform_input", + MagicMock(return_value=TransformInputResult(b"result_bytes")), + ) + if stage not in [ + "fetch_model", + "load_model", + "fetch_inputs", + "transform_input", + "execute", + ]: + monkeypatch.setattr( + integrated_worker, + "execute", + MagicMock(return_value=ExecuteResult(b"result_bytes")), + ) + if stage not in [ + "fetch_model", + "load_model", + "fetch_inputs", + "transform_input", + "execute", + "transform_output", + ]: + monkeypatch.setattr( + integrated_worker, + "transform_output", + MagicMock( + return_value=TransformOutputResult(b"result", [], "c", "float32") + ), + ) + + 
worker_manager._on_iteration()
+
+    mock_reply_fn.assert_called_once()
+    mock_reply_fn.assert_called_with("fail", error_message)
+
+
+def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch):
+    """Ensures the exception handler builds a failure reply
+    without crashing the worker manager"""
+    reply = InferenceReply()
+
+    mock_reply_fn = MagicMock()
+    monkeypatch.setattr(
+        "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply",
+        mock_reply_fn,
+    )
+
+    test_exception = ValueError("Test ValueError")
+    exception_handler(test_exception, None, "Failure while fetching the model.")
+
+    mock_reply_fn.assert_called_once()
+    mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.")
diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py
new file mode 100644
index 0000000000..d1c4d226bb
--- /dev/null
+++ b/tests/dragon/test_reply_building.py
@@ -0,0 +1,91 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
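+
+# Summary of the helpers exercised below: build_failure_reply wraps a (status, message)
+# pair in a Response, build_reply maps an InferenceReply onto the response schema, and
+# both raise ValueError when the supplied status is not a member of the Status enum.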
+ +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.control.workermanager import ( + build_failure_reply, + build_reply, +) +from smartsim._core.mli.infrastructure.worker.worker import InferenceReply + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.mark.parametrize( + "status, message", + [ + pytest.param("timeout", "Worker timed out", id="timeout"), + pytest.param("fail", "Failed while executing", id="fail"), + ], +) +def test_build_failure_reply(status: "Status", message: str): + "Ensures failure replies can be built successfully" + response = build_failure_reply(status, message) + assert response.status == status + assert response.message == message + + +def test_build_failure_reply_fails(): + "Ensures ValueError is raised if a Status Enum is not used" + with pytest.raises(ValueError) as ex: + response = build_failure_reply("not a status enum", "message") + + assert "Error assigning status to response" in ex.value.args[0] + + +@pytest.mark.parametrize( + "status, message", + [ + pytest.param("complete", "Success", id="complete"), + ], +) +def test_build_reply(status: "Status", message: str): + "Ensures replies can be built successfully" + reply = InferenceReply() + reply.status_enum = status + reply.message = message + response = build_reply(reply) + assert response.status == status + assert response.message == message + + +def test_build_reply_fails(): + "Ensures ValueError is raised if a Status Enum is not used" + with pytest.raises(ValueError) as ex: + reply = InferenceReply() + reply.status_enum = "not a status enum" + response = build_reply(reply) + + assert "Error assigning status to response" in ex.value.args[0] diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py new file mode 100644 index 0000000000..df76c484b5 --- /dev/null +++ b/tests/dragon/utils/channel.py @@ -0,0 +1,64 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
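+
+# FileSystemCommChannel below gives the dragon tests a CommChannelBase implementation
+# that persists each sent message to a file; recv is left as a stub because the
+# error-handling tests above only exercise the send path.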
+
+import pathlib
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+    """Passes messages by writing to a file"""
+
+    def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
+        """Initialize the FileSystemCommChannel instance"""
+        if not isinstance(key, bytes):
+            super().__init__(key.as_posix().encode("utf-8"))
+            self._file_path = key
+        else:
+            super().__init__(key)
+            self._file_path = pathlib.Path(key.decode("utf-8"))
+
+        if not self._file_path.parent.exists():
+            self._file_path.parent.mkdir(parents=True)
+
+        self._file_path.touch()
+
+    def send(self, value: bytes) -> None:
+        """Send a message through the underlying communication channel
+        :param value: The value to send"""
+        logger.debug(
+            f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
+        )
+        self._file_path.write_bytes(value)
+
+    def recv(self) -> bytes:
+        """Receive a message through the underlying communication channel
+        :returns: the received message"""
+        ...
diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py
new file mode 100644
index 0000000000..b1de280185
--- /dev/null
+++ b/tests/dragon/utils/worker.py
@@ -0,0 +1,128 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import io +import typing as t + +import torch + +import smartsim._core.mli.infrastructure.worker.worker as mliw +import smartsim.error as sse +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): + """A minimum implementation of a worker that executes a PyTorch model""" + + # @staticmethod + # def deserialize(request: InferenceRequest) -> t.List[t.Any]: + # # request.input_meta + # # request.raw_inputs + # return request + + @staticmethod + def load_model( + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + ) -> mliw.LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + model: torch.nn.Module = torch.load(io.BytesIO(model_bytes)) + result = mliw.LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: mliw.InferenceRequest, + fetch_result: mliw.FetchInputResult, + ) -> mliw.TransformInputResult: + # extra metadata for assembly can be found in request.input_meta + raw_inputs = request.raw_inputs or fetch_result.inputs + + result: t.List[torch.Tensor] = [] + # should this happen here? + # consider - fortran to c data layout + # is there an intermediate representation before really doing torch.load? + if raw_inputs: + result = [torch.load(io.BytesIO(item)) for item in raw_inputs] + + return mliw.TransformInputResult(result) + + @staticmethod + def execute( + request: mliw.InferenceRequest, + load_result: mliw.LoadModelResult, + transform_result: mliw.TransformInputResult, + ) -> mliw.ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model = load_result.model + results = [model(tensor) for tensor in transform_result.transformed] + + execute_result = mliw.ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: mliw.InferenceRequest, + execute_result: mliw.ExecuteResult, + ) -> mliw.TransformOutputResult: + # transformed = [item.clone() for item in execute_result.predictions] + # return OutputTransformResult(transformed) + + # transformed = [item.bytes() for item in execute_result.predictions] + + # OutputTransformResult.transformed SHOULD be a list of + # capnproto Tensors Or tensor descriptors accompanying bytes + + # send the original tensors... + execute_result.predictions = [t.detach() for t in execute_result.predictions] + # todo: solve sending all tensor metadata that coincisdes with each prediction + return mliw.TransformOutputResult( + execute_result.predictions, [1], "c", "float32" + ) + # return OutputTransformResult(transformed) + + # @staticmethod + # def serialize_reply( + # request: InferenceRequest, results: OutputTransformResult + # ) -> t.Any: + # # results = IntegratedTorchWorker._prepare_outputs(results.outputs) + # # return results + # return None + # # response = MessageHandler.build_response( + # # status=200, # todo: are we satisfied with 0/1 (success, fail) + # # # todo: if not detailed messages, this shouldn't be returned. 
+ # # message="success", + # # result=results, + # # custom_attributes=None, + # # ) + # # serialized_resp = MessageHandler.serialize_response(response) + # # return serialized_resp diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 7b345f9ef1..df4b0a637f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -149,6 +149,7 @@ def mock_messages( model=message_model_key, inputs=[message_tensor_input_key], outputs=[message_tensor_output_key], + output_descriptors=[], custom_attributes=None, ) request_bytes = MessageHandler.serialize_request(request) From 7169f1c7298dfd497e89aab87d08d15dbc475eb7 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:15:08 -0700 Subject: [PATCH 11/60] Schema performance improvements (#632) Schemas were enhanced for performance. [committed by @AlyssaCote ] [approved by @al-rigazzi @mellis13 ] --- doc/changelog.md | 1 + ex/high_throughput_inference/mock_app.py | 18 +- smartsim/_core/mli/comm/channel/channel.py | 2 +- .../_core/mli/comm/channel/dragonchannel.py | 5 +- smartsim/_core/mli/comm/channel/dragonfli.py | 17 +- .../infrastructure/control/workermanager.py | 52 +- .../mli/infrastructure/worker/torch_worker.py | 10 +- .../_core/mli/infrastructure/worker/worker.py | 2 +- smartsim/_core/mli/message_handler.py | 46 +- .../mli/mli_schemas/request/request.capnp | 4 +- .../mli/mli_schemas/request/request_capnp.pyi | 20 +- .../mli/mli_schemas/response/response.capnp | 2 +- .../mli_schemas/response/response_capnp.pyi | 18 +- .../_core/mli/mli_schemas/tensor/tensor.capnp | 7 +- .../mli/mli_schemas/tensor/tensor_capnp.py | 3 - .../mli/mli_schemas/tensor/tensor_capnp.pyi | 43 -- tests/mli/test_torch_worker.py | 13 +- .../test_message_handler/test_build_tensor.py | 185 ------- .../test_build_tensor_desc.py | 90 ++++ tests/test_message_handler/test_request.py | 495 ++---------------- tests/test_message_handler/test_response.py | 248 ++------- 21 files changed, 316 insertions(+), 965 deletions(-) delete mode 100644 tests/test_message_handler/test_build_tensor.py create mode 100644 tests/test_message_handler/test_build_tensor_desc.py diff --git a/doc/changelog.md b/doc/changelog.md index 495cff3edd..1c91705add 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example - Add error handling in Worker Manager pipeline - Add EnvironmentConfigLoader for ML Worker Manager diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 45246db2e5..e244c93e0f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -108,10 +108,11 @@ def print_timings(self, to_file: bool = False): def run_model(self, model: bytes | str, batch: torch.Tensor): + tensors = [batch.numpy()] self.start_timings(batch.shape[0]) - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape)) - self.measure_time("build_tensor") + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape)) + self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) @@ -120,7 +121,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): request = MessageHandler.build_request( 
reply_channel=self._from_worker_ch_serialized, model= model_arg, - inputs=[built_tensor], + inputs=[built_tensor_desc], outputs=[], output_descriptors=[], custom_attributes=None, @@ -130,6 +131,9 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + for t in tensors: + to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + # to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") @@ -138,10 +142,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("receive") response = MessageHandler.deserialize_response(resp) self.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(response.result.descriptors)? + data_blob = from_recvh.recv_bytes(timeout=None) result = torch.from_numpy( numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), + data_blob, + dtype=str(response.result.descriptors[0].dataType), ) ) self.measure_time("deserialize_tensor") diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 2318896a9b..a3cce21814 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -45,7 +45,7 @@ def send(self, value: bytes) -> None: :param value: The value to send""" @abstractmethod - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1409747a91..672fce75b2 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import sys +import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -52,9 +53,9 @@ def send(self, value: bytes) -> None: with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: message_bytes: bytes = recvh.recv_bytes(timeout=None) - return message_bytes + return [message_bytes] diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 75f8fb4bfc..28b4c2bf3b 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -57,13 +57,16 @@ def send(self, value: bytes) -> None: with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) - def recv(self) -> bytes: + def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" + messages = [] + eot = False with self._fli.recvh(timeout=None) as recvh: - try: - request_bytes: bytes - request_bytes, _ = recvh.recv_bytes(timeout=None) - return request_bytes - except fli.FLIEOT as exc: - return b"" + while not eot: + try: + message, _ = recvh.recv_bytes(timeout=None) + messages.append(message) + except fli.FLIEOT as exc: + eot = True + return messages diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8e3ed3fb4c..27f5bfc971 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -58,6 +58,7 @@ from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import Status + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -88,25 +89,23 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) + input_bytes: t.Optional[t.List[bytes]] = None + output_keys: t.Optional[t.List[str]] = None - input_meta: t.List[t.Any] = [] + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore if request.output: output_keys = [tensor_key.key for tensor_key in request.output] @@ -142,20 +141,13 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: msg_key = MessageHandler.build_tensor_key(key) prepared_outputs.append(msg_key) elif reply.outputs: - arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ - output.numpy() for output in reply.outputs - ] - for tensor in arrays: - # 
todo: need to have the output attributes specified in the req? - # maybe, add `MessageHandler.dtype_of(tensor)`? - # can `build_tensor` do dtype and shape? - msg_tensor = MessageHandler.build_tensor( - tensor, + for _ in reply.outputs: + msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", [1], ) - prepared_outputs.append(msg_tensor) + prepared_outputs.append(msg_tensor_desc) return prepared_outputs @@ -272,13 +264,28 @@ def _on_iteration(self) -> None: return timings = [] # timing - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() + + bytes_list: t.List[bytes] = self._task_queue.recv() + + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + return + + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] interm = time.perf_counter() # timing request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) + + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list + if not self._validate_request(request): return @@ -430,7 +437,12 @@ def _on_iteration(self) -> None: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing if request.callback: + # send serialized response request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index a4e725ab99..e732ecd2cd 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -110,10 +110,16 @@ def transform_output( result_device: str, ) -> TransformOutputResult: if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions] + transformed = [ + item.to("cpu").numpy().tobytes() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme return TransformOutputResult( - execute_result.predictions, None, "c", "float32" + [item.numpy().tobytes() for item in execute_result.predictions], + None, + "c", + "float32", ) # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index dd874abe39..bb8d822311 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -59,7 +59,7 @@ def __init__( self.model_key = model_key self.raw_model = raw_model self.callback = callback - self.raw_inputs = raw_inputs + self.raw_inputs = raw_inputs or [] self.input_keys = input_keys or [] self.input_meta = input_meta or [] self.output_keys = output_keys or [] diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 4fe2bef3a7..00670dce8a 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -25,8 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import typing as t -import numpy as np - from .mli_schemas.data import data_references_capnp from .mli_schemas.model import model_capnp from .mli_schemas.request import request_capnp @@ -38,17 +36,15 @@ class MessageHandler: @staticmethod - def build_tensor( - tensor: np.ndarray[t.Any, np.dtype[t.Any]], + def build_tensor_descriptor( order: "tensor_capnp.Order", data_type: "tensor_capnp.NumericalType", dimensions: t.List[int], - ) -> tensor_capnp.Tensor: + ) -> tensor_capnp.TensorDescriptor: """ - Builds a Tensor message using the provided data, + Builds a TensorDescriptor message using the provided order, data type, and dimensions. - :param tensor: Tensor to build the message around :param order: Order of the tensor, such as row-major (c) or column-major (f) :param data_type: Data type of the tensor :param dimensions: Dimensions of the tensor @@ -59,15 +55,12 @@ def build_tensor( description.order = order description.dataType = data_type description.dimensions = dimensions - built_tensor = tensor_capnp.Tensor.new_message() - built_tensor.blob = tensor.tobytes() # tensor channel instead? - built_tensor.tensorDescriptor = description except Exception as e: raise ValueError( - "Error building tensor." + "Error building tensor descriptor." ) from e # TODO: create custom exception - return built_tensor + return description @staticmethod def build_output_tensor_descriptor( @@ -240,7 +233,7 @@ def _assign_reply_channel( :raises ValueError: if building fails """ try: - request.replyChannel.reply = reply_channel + request.replyChannel.descriptor = reply_channel except Exception as e: raise ValueError("Error building reply channel portion of request.") from e @@ -248,7 +241,8 @@ def _assign_reply_channel( def _assign_inputs( request: request_capnp.Request, inputs: t.Union[ - t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], ], ) -> None: """ @@ -262,14 +256,13 @@ def _assign_inputs( if inputs: display_name = inputs[0].schema.node.displayName # type: ignore input_class_name = display_name.split(":")[-1] - if input_class_name == "Tensor": - request.input.data = inputs # type: ignore + if input_class_name == "TensorDescriptor": + request.input.descriptors = inputs # type: ignore elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError( - "Invalid input class name. Expected 'Tensor' or 'TensorKey'." - ) + raise ValueError("""Invalid input class name. 
Expected + 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -351,7 +344,8 @@ def build_request( reply_channel: bytes, model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ - t.List[data_references_capnp.TensorKey], t.List[tensor_capnp.Tensor] + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], ], outputs: t.List[data_references_capnp.TensorKey], output_descriptors: t.List[tensor_capnp.OutputDescriptor], @@ -437,7 +431,8 @@ def _assign_message(response: response_capnp.Response, message: str) -> None: def _assign_result( response: response_capnp.Response, result: t.Union[ - t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], ], ) -> None: """ @@ -452,13 +447,13 @@ def _assign_result( first_result = result[0] display_name = first_result.schema.node.displayName # type: ignore result_class_name = display_name.split(":")[-1] - if result_class_name == "Tensor": - response.result.data = result # type: ignore + if result_class_name == "TensorDescriptor": + response.result.descriptors = result # type: ignore elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: raise ValueError("""Invalid custom attribute class name. - Expected 'Tensor' or 'TensorKey'.""") + Expected 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -501,7 +496,8 @@ def build_response( status: "response_capnp.Status", message: str, result: t.Union[ - t.List[tensor_capnp.Tensor], t.List[data_references_capnp.TensorKey] + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index f9508cb54f..4be1cfa215 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -32,7 +32,7 @@ using DataRef = import "../data/data_references.capnp"; using Models = import "../model/model.capnp"; struct ChannelDescriptor { - reply @0 :Data; + descriptor @0 :Data; } struct Request { @@ -43,7 +43,7 @@ struct Request { } input :union { keys @3 :List(DataRef.TensorKey); - data @4 :List(Tensors.Tensor); + descriptors @4 :List(Tensors.TensorDescriptor); } output @5 :List(DataRef.TensorKey); outputDescriptors @6 :List(Tensors.OutputDescriptor); diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index 39093f61ad..a4ad631f9f 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -47,9 +47,9 @@ from ..tensor.tensor_capnp import ( OutputDescriptor, OutputDescriptorBuilder, OutputDescriptorReader, - Tensor, - TensorBuilder, - TensorReader, + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, ) from .request_attributes.request_attributes_capnp import ( TensorFlowRequestAttributes, @@ -61,7 +61,7 @@ from .request_attributes.request_attributes_capnp import ( ) class ChannelDescriptor: - reply: bytes + descriptor: bytes @staticmethod @contextmanager def from_bytes( @@ -143,8 +143,10 @@ class Request: class Input: keys: Sequence[TensorKey | 
TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] - def which(self) -> Literal["keys", "data"]: ... + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... @staticmethod @contextmanager def from_bytes( @@ -164,12 +166,14 @@ class Request: class InputReader(Request.Input): keys: Sequence[TensorKeyReader] - data: Sequence[TensorReader] + descriptors: Sequence[TensorDescriptorReader] def as_builder(self) -> Request.InputBuilder: ... class InputBuilder(Request.Input): keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] @staticmethod def from_dict(dictionary: dict) -> Request.InputBuilder: ... def copy(self) -> Request.InputBuilder: ... diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp index 83aa05a41b..7194524cd0 100644 --- a/smartsim/_core/mli/mli_schemas/response/response.capnp +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -42,7 +42,7 @@ struct Response { message @1 :Text; result :union { keys @2 :List(DataRef.TensorKey); - data @3 :List(Tensors.Tensor); + descriptors @3 :List(Tensors.TensorDescriptor); } customAttributes :union { torch @4 :ResponseAttributes.TorchResponseAttributes; diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi index f19bdefe04..6b4c50fd05 100644 --- a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -35,7 +35,11 @@ from io import BufferedWriter from typing import Iterator, Literal, Sequence, overload from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader -from ..tensor.tensor_capnp import Tensor, TensorBuilder, TensorReader +from ..tensor.tensor_capnp import ( + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, +) from .response_attributes.response_attributes_capnp import ( TensorFlowResponseAttributes, TensorFlowResponseAttributesBuilder, @@ -50,8 +54,10 @@ Status = Literal["complete", "fail", "timeout", "running"] class Response: class Result: keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] - def which(self) -> Literal["keys", "data"]: ... + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... @staticmethod @contextmanager def from_bytes( @@ -71,12 +77,14 @@ class Response: class ResultReader(Response.Result): keys: Sequence[TensorKeyReader] - data: Sequence[TensorReader] + descriptors: Sequence[TensorDescriptorReader] def as_builder(self) -> Response.ResultBuilder: ... class ResultBuilder(Response.Result): keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] - data: Sequence[Tensor | TensorBuilder | TensorReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] @staticmethod def from_dict(dictionary: dict) -> Response.ResultBuilder: ... def copy(self) -> Response.ResultBuilder: ... 
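Taken together, the request and response schema changes above replace inline Tensor blobs with lightweight TensorDescriptor metadata; the raw tensor bytes now travel as separate messages on the same channel, envelope first and one bytes message per tensor. A minimal client-side sketch of the new flow, mirroring the mock_app.py hunk earlier in this patch (the reply-channel bytes and model key are illustrative placeholders, not values from this PR):

    import numpy as np

    from smartsim._core.mli.message_handler import MessageHandler

    batch = np.ones((32, 1), dtype=np.float32)

    # Describe the tensor instead of embedding its bytes in the request.
    descriptor = MessageHandler.build_tensor_descriptor("c", "float32", list(batch.shape))

    request = MessageHandler.build_request(
        reply_channel=b"reply-channel-descriptor",          # placeholder channel bytes
        model=MessageHandler.build_model_key("model-key"),  # placeholder feature store key
        inputs=[descriptor],
        outputs=[],
        output_descriptors=[],
        custom_attributes=None,
    )
    request_bytes = MessageHandler.serialize_request(request)

    # On the wire, the serialized request is sent first and each tensor follows as its
    # own bytes message (send_bytes(request_bytes), then send_bytes(batch.tobytes())),
    # so the worker manager reads bytes_list[0] as the request and bytes_list[1:] as
    # the raw inputs.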
diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp index aca1ce0836..4b2218b166 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -58,12 +58,7 @@ enum ReturnNumericalType { float32 @8; float64 @9; none @10; - auto @ 11; -} - -struct Tensor { - blob @0 :Data; - tensorDescriptor @1 :TensorDescriptor; + auto @11; } struct TensorDescriptor { diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py index aa7f1e7b18..8c9d6c9029 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py @@ -33,9 +33,6 @@ capnp.remove_import_hook() here = os.path.dirname(os.path.abspath(__file__)) module_file = os.path.abspath(os.path.join(here, "tensor.capnp")) -Tensor = capnp.load(module_file).Tensor -TensorBuilder = Tensor -TensorReader = Tensor TensorDescriptor = capnp.load(module_file).TensorDescriptor TensorDescriptorBuilder = TensorDescriptor TensorDescriptorReader = TensorDescriptor diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi index 7e7222ef54..b55f26b452 100644 --- a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi @@ -101,49 +101,6 @@ class TensorDescriptorBuilder(TensorDescriptor): @staticmethod def write_packed(file: BufferedWriter) -> None: ... -class Tensor: - blob: bytes - tensorDescriptor: ( - TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader - ) - def init(self, name: Literal["tensorDescriptor"]) -> TensorDescriptor: ... - @staticmethod - @contextmanager - def from_bytes( - data: bytes, - traversal_limit_in_words: int | None = ..., - nesting_limit: int | None = ..., - ) -> Iterator[TensorReader]: ... - @staticmethod - def from_bytes_packed( - data: bytes, - traversal_limit_in_words: int | None = ..., - nesting_limit: int | None = ..., - ) -> TensorReader: ... - @staticmethod - def new_message() -> TensorBuilder: ... - def to_dict(self) -> dict: ... - -class TensorReader(Tensor): - tensorDescriptor: TensorDescriptorReader - def as_builder(self) -> TensorBuilder: ... - -class TensorBuilder(Tensor): - tensorDescriptor: ( - TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader - ) - @staticmethod - def from_dict(dictionary: dict) -> TensorBuilder: ... - def copy(self) -> TensorBuilder: ... - def to_bytes(self) -> bytes: ... - def to_bytes_packed(self) -> bytes: ... - def to_segments(self) -> list[bytes]: ... - def as_reader(self) -> TensorReader: ... - @staticmethod - def write(file: BufferedWriter) -> None: ... - @staticmethod - def write_packed(file: BufferedWriter) -> None: ... 
- class OutputDescriptor: order: Order optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index 0b1cd4ccf3..b73e4a31b5 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -95,17 +95,18 @@ def create_torch_model(): def get_request() -> InferenceRequest: tensors = [get_batch() for _ in range(2)] - serialized_tensors = [ - MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + tensor_numpy = [tensor.numpy() for tensor in tensors] + serialized_tensors_descriptors = [ + MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape)) for tensor in tensors ] return InferenceRequest( model_key="model", callback=None, - raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + raw_inputs=tensor_numpy, input_keys=None, - input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + input_meta=serialized_tensors_descriptors, output_keys=None, raw_model=create_torch_model(), batch_size=0, @@ -167,7 +168,9 @@ def test_transform_output(mlutils): sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] ) - assert transformed_output.outputs == execute_result.predictions + assert transformed_output.outputs == [ + item.numpy().tobytes() for item in execute_result.predictions + ] assert transformed_output.shape == None assert transformed_output.order == "c" assert transformed_output.dtype == "float32" diff --git a/tests/test_message_handler/test_build_tensor.py b/tests/test_message_handler/test_build_tensor.py deleted file mode 100644 index aa7bd4e6e2..0000000000 --- a/tests/test_message_handler/test_build_tensor.py +++ /dev/null @@ -1,185 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - - small_tf_tensor = tf.zeros((3, 2, 5), dtype=tf.int8) - small_tf_tensor = small_tf_tensor.numpy() - medium_tf_tensor = tf.ones((1040, 1040, 3), dtype=tf.int64) - medium_tf_tensor = medium_tf_tensor.numpy() - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - small_torch_tensor = torch.zeros((3, 2, 5), dtype=torch.int8) - small_torch_tensor = small_torch_tensor.numpy() - medium_torch_tensor = torch.ones((1040, 1040, 3), dtype=torch.int64) - medium_torch_tensor = medium_torch_tensor.numpy() - -from smartsim._core.mli.message_handler import MessageHandler - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - -handler = MessageHandler() - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param( - small_torch_tensor, - "int8", - "c", - [3, 2, 5], - id="small torch tensor", - ), - pytest.param( - medium_torch_tensor, - "int64", - "c", - [1040, 1040, 3], - id="medium torch tensor", - ), - ], -) -def test_build_torch_tensor_successful(tensor, dtype, order, dimension): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - assert built_tensor is not None - assert type(built_tensor.blob) == bytes - assert built_tensor.tensorDescriptor.order == order - assert built_tensor.tensorDescriptor.dataType == dtype - for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): - assert i == j - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param( - small_tf_tensor, - "int8", - "c", - [3, 2, 5], - id="small tf tensor", - ), - pytest.param( - medium_tf_tensor, - "int64", - "c", - [1040, 1040, 3], - id="medium tf tensor", - ), - ], -) -def test_build_tf_tensor_successful(tensor, dtype, order, dimension): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - assert built_tensor is not None - assert type(built_tensor.blob) == bytes - assert built_tensor.tensorDescriptor.order == order - assert built_tensor.tensorDescriptor.dataType == dtype - for i, j in zip(built_tensor.tensorDescriptor.dimensions, dimension): - assert i == j - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), - pytest.param( - small_torch_tensor, - "bad_order", - "int8", - [3, 2, 5], - id="bad order type", - ), - pytest.param( - small_torch_tensor, - "f", - "bad_num_type", - [3, 2, 5], - id="bad numerical type", - ), - pytest.param( - small_torch_tensor, - "f", - "int8", - "bad shape type", - id="bad shape type", - ), - ], -) -def test_build_torch_tensor_bad_input(tensor, dtype, order, dimension): - with pytest.raises(ValueError): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "tensor, dtype, order, dimension", - [ - pytest.param([1, 2, 4], "c", "int8", [1, 2, 3], id="bad tensor type"), - pytest.param( - small_tf_tensor, - "bad_order", - "int8", - [3, 2, 5], - id="bad order type", - ), - pytest.param( - small_tf_tensor, - "f", - "bad_num_type", - [3, 2, 5], - id="bad numerical 
type", - ), - pytest.param( - small_tf_tensor, - "f", - "int8", - "bad shape type", - id="bad shape type", - ), - ], -) -def test_build_tf_tensor_bad_input(tensor, dtype, order, dimension): - with pytest.raises(ValueError): - built_tensor = handler.build_tensor(tensor, order, dtype, dimension) diff --git a/tests/test_message_handler/test_build_tensor_desc.py b/tests/test_message_handler/test_build_tensor_desc.py new file mode 100644 index 0000000000..45126fb16c --- /dev/null +++ b/tests/test_message_handler/test_build_tensor_desc.py @@ -0,0 +1,90 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "int8", + "c", + [3, 2, 5], + id="small torch tensor", + ), + pytest.param( + "int64", + "c", + [1040, 1040, 3], + id="medium torch tensor", + ), + ], +) +def test_build_tensor_descriptor_successful(dtype, order, dimension): + built_tensor_descriptor = handler.build_tensor_descriptor(order, dtype, dimension) + assert built_tensor_descriptor is not None + assert built_tensor_descriptor.order == order + assert built_tensor_descriptor.dataType == dtype + for i, j in zip(built_tensor_descriptor.dimensions, dimension): + assert i == j + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "bad_order", + "int8", + [3, 2, 5], + id="bad order type", + ), + pytest.param( + "f", + "bad_num_type", + [3, 2, 5], + id="bad numerical type", + ), + pytest.param( + "f", + "int8", + "bad shape type", + id="bad shape type", + ), + ], +) +def test_build_tensor_descriptor_unsuccessful(dtype, order, dimension): + with pytest.raises(ValueError): + built_tensor_descriptor = handler.build_tensor_descriptor( + order, dtype, dimension + ) diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index b1fedaa024..4cfc115845 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -28,46 +28,6 @@ from smartsim._core.mli.message_handler import MessageHandler -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) - tflow2 = tf.ones((10, 10, 3), dtype=tf.int64) - - tensor_3 = MessageHandler.build_tensor( - tflow1.numpy(), "c", "int8", list(tflow1.shape) - ) - tensor_4 = MessageHandler.build_tensor( - tflow2.numpy(), "c", "int64", list(tflow2.shape) - ) - - tf_attributes = MessageHandler.build_tf_request_attributes( - name="tf", tensor_type="sparse" - ) - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) - torch2 = torch.ones((10, 10, 3), dtype=torch.int64) - - tensor_1 = MessageHandler.build_tensor( - torch1.numpy(), "c", "int8", list(torch1.shape) - ) - tensor_2 = MessageHandler.build_tensor( - torch2.numpy(), "c", "int64", list(torch2.shape) - ) - - torch_attributes = MessageHandler.build_torch_request_attributes("sparse") - # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -87,123 +47,54 @@ output_descriptor3 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1], "none", [1, 2, 3] ) +torch_attributes = MessageHandler.build_torch_request_attributes("sparse") +tf_attributes = MessageHandler.build_tf_request_attributes( + name="tf", tensor_type="sparse" +) +tensor_1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor_2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) +tensor_3 = MessageHandler.build_tensor_descriptor("f", "int8", [1]) +tensor_4 = MessageHandler.build_tensor_descriptor("f", "int64", [3, 2]) -if should_run_tf: - tf_indirect_request = MessageHandler.build_request( - b"reply", - model, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1, output_descriptor2, output_descriptor3], - tf_attributes, - ) - 
tf_direct_request = MessageHandler.build_request( - b"reply", - model, - [tensor_3, tensor_4], - [], - [output_descriptor1, output_descriptor2], - tf_attributes, - ) +tf_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + tf_attributes, +) -if should_run_torch: - torch_indirect_request = MessageHandler.build_request( - b"reply", - model, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1, output_descriptor2, output_descriptor3], - torch_attributes, - ) - torch_direct_request = MessageHandler.build_request( - b"reply", - model, - [tensor_1, tensor_2], - [], - [output_descriptor1, output_descriptor2], - torch_attributes, - ) +tf_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_3, tensor_4], + [], + [output_descriptor1, output_descriptor2], + tf_attributes, +) +torch_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + torch_attributes, +) -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - b"reply channel", - model_key, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [input_key1], - [output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [input_key1], - [output_key2], - [output_descriptor1], - tf_attributes, - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1], - [output_descriptor1], - None, - ), - ], +torch_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_1, tensor_2], + [], + [output_descriptor1, output_descriptor2], + torch_attributes, ) -def test_build_request_indirect_tf_successful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - assert built_request is not None - assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "key": - assert built_request.model.key.key == model.key - else: - assert built_request.model.data.data == model.data - assert built_request.model.data.name == model.name - assert built_request.model.data.version == model.version - assert built_request.input.which() == "keys" - assert built_request.input.keys[0].key == input[0].key - assert len(built_request.input.keys) == len(input) - assert len(built_request.output) == len(output) - for i, j in zip(built_request.outputDescriptors, output_descriptors): - assert i.order == j.order - if built_request.customAttributes.which() == "tf": - assert ( - built_request.customAttributes.tf.tensorType == custom_attributes.tensorType - ) - elif built_request.customAttributes.which() == "torch": - assert ( - built_request.customAttributes.torch.tensorType - == custom_attributes.tensorType - ) - else: - assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "reply_channel, model, input, output, 
output_descriptors, custom_attributes", [ @@ -221,7 +112,7 @@ def test_build_request_indirect_tf_successful( [input_key1], [output_key2], [output_descriptor1], - torch_attributes, + tf_attributes, ), pytest.param( b"another reply channel", @@ -241,7 +132,7 @@ def test_build_request_indirect_tf_successful( ), ], ) -def test_build_request_indirect_torch_successful( +def test_build_request_indirect_successful( reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( @@ -253,7 +144,7 @@ def test_build_request_indirect_torch_successful( custom_attributes, ) assert built_request is not None - assert built_request.replyChannel.reply == reply_channel + assert built_request.replyChannel.descriptor == reply_channel if built_request.model.which() == "key": assert built_request.model.key.key == model.key else: @@ -279,108 +170,6 @@ def test_build_request_indirect_torch_successful( assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - [], - model_key, - [input_key1, input_key2], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad channel", - ), - pytest.param( - b"reply channel", - "bad model", - [input_key1], - [output_key2], - [output_descriptor1], - torch_attributes, - id="bad model", - ), - pytest.param( - b"reply channel", - model_key, - ["input_key1", "input_key2"], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad inputs", - ), - pytest.param( - b"reply channel", - model_key, - [model_key], - [output_key1, output_key2], - [output_descriptor1], - torch_attributes, - id="bad input schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - ["output_key1", "output_key2"], - [output_descriptor1], - torch_attributes, - id="bad outputs", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [model_key], - [output_descriptor1], - torch_attributes, - id="bad output schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - [output_descriptor1], - "bad attributes", - id="bad custom attributes", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - [output_descriptor1], - model_key, - id="bad custom attributes schema type", - ), - pytest.param( - b"reply channel", - model_key, - [input_key1], - [output_key1, output_key2], - "bad descriptors", - torch_attributes, - id="bad output descriptors", - ), - ], -) -def test_build_request_indirect_torch_unsuccessful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - with pytest.raises(ValueError): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -399,7 +188,7 @@ def test_build_request_indirect_torch_unsuccessful( [input_key1], [output_key2], [output_descriptor1], - tf_attributes, + torch_attributes, id="bad model", ), pytest.param( @@ -417,7 +206,7 @@ def test_build_request_indirect_torch_unsuccessful( [model_key], [output_key1, output_key2], [output_descriptor1], - 
tf_attributes, + torch_attributes, id="bad input schema type", ), pytest.param( @@ -462,12 +251,12 @@ def test_build_request_indirect_torch_unsuccessful( [input_key1], [output_key1, output_key2], "bad descriptors", - tf_attributes, + torch_attributes, id="bad output descriptors", ), ], ) -def test_build_request_indirect_tf_unsuccessful( +def test_build_request_indirect_unsuccessful( reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): @@ -481,7 +270,6 @@ def test_build_request_indirect_tf_unsuccessful( ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -499,88 +287,12 @@ def test_build_request_indirect_tf_unsuccessful( [tensor_1], [], [output_descriptor3], - torch_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_2], - [], - [output_descriptor1], - torch_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_1], - [], - [output_descriptor1], - None, - ), - ], -) -def test_build_request_direct_torch_successful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - assert built_request is not None - assert built_request.replyChannel.reply == reply_channel - if built_request.model.which() == "key": - assert built_request.model.key.key == model.key - else: - assert built_request.model.data.data == model.data - assert built_request.model.data.name == model.name - assert built_request.model.data.version == model.version - assert built_request.input.which() == "data" - assert built_request.input.data[0].blob == input[0].blob - assert len(built_request.input.data) == len(input) - assert len(built_request.output) == len(output) - for i, j in zip(built_request.outputDescriptors, output_descriptors): - assert i.order == j.order - if built_request.customAttributes.which() == "tf": - assert ( - built_request.customAttributes.tf.tensorType == custom_attributes.tensorType - ) - elif built_request.customAttributes.which() == "torch": - assert ( - built_request.customAttributes.torch.tensorType - == custom_attributes.tensorType - ) - else: - assert built_request.customAttributes.none == custom_attributes - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - b"reply channel", - model_key, - [tensor_3, tensor_4], - [], - [output_descriptor2], tf_attributes, ), pytest.param( b"another reply channel", model, - [tensor_4], - [], - [output_descriptor3], - tf_attributes, - ), - pytest.param( - b"another reply channel", - model, - [tensor_4], + [tensor_2], [], [output_descriptor1], tf_attributes, @@ -588,14 +300,14 @@ def test_build_request_direct_torch_successful( pytest.param( b"another reply channel", model, - [tensor_3], + [tensor_1], [], [output_descriptor1], None, ), ], ) -def test_build_request_direct_tf_successful( +def test_build_request_direct_successful( reply_channel, model, input, output, output_descriptors, custom_attributes ): built_request = MessageHandler.build_request( @@ -607,16 +319,15 @@ def test_build_request_direct_tf_successful( custom_attributes, ) assert built_request is not None - assert built_request.replyChannel.reply == 
reply_channel + assert built_request.replyChannel.descriptor == reply_channel if built_request.model.which() == "key": assert built_request.model.key.key == model.key else: assert built_request.model.data.data == model.data assert built_request.model.data.name == model.name assert built_request.model.data.version == model.version - assert built_request.input.which() == "data" - assert built_request.input.data[0].blob == input[0].blob - assert len(built_request.input.data) == len(input) + assert built_request.input.which() == "descriptors" + assert len(built_request.input.descriptors) == len(input) assert len(built_request.output) == len(output) for i, j in zip(built_request.outputDescriptors, output_descriptors): assert i.order == j.order @@ -633,81 +344,6 @@ def test_build_request_direct_tf_successful( assert built_request.customAttributes.none == custom_attributes -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "reply_channel, model, input, output, output_descriptors, custom_attributes", - [ - pytest.param( - [], - model_key, - [tensor_1, tensor_2], - [], - [output_descriptor2], - torch_attributes, - id="bad channel", - ), - pytest.param( - b"reply channel", - "bad model", - [tensor_1], - [], - [output_descriptor2], - torch_attributes, - id="bad model", - ), - pytest.param( - b"reply channel", - model_key, - ["input_key1", "input_key2"], - [], - [output_descriptor2], - torch_attributes, - id="bad inputs", - ), - pytest.param( - b"reply channel", - model_key, - [], - ["output_key1", "output_key2"], - [output_descriptor2], - torch_attributes, - id="bad outputs", - ), - pytest.param( - b"reply channel", - model_key, - [tensor_1], - [], - [output_descriptor2], - "bad attributes", - id="bad custom attributes", - ), - pytest.param( - b"reply_channel", - model_key, - [tensor_1, tensor_2], - [], - ["output_descriptor2"], - torch_attributes, - id="bad output descriptors", - ), - ], -) -def test_build_torch_request_direct_unsuccessful( - reply_channel, model, input, output, output_descriptors, custom_attributes -): - with pytest.raises(ValueError): - built_request = MessageHandler.build_request( - reply_channel, - model, - input, - output, - output_descriptors, - custom_attributes, - ) - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ @@ -735,7 +371,7 @@ def test_build_torch_request_direct_unsuccessful( ["input_key1", "input_key2"], [], [output_descriptor2], - tf_attributes, + torch_attributes, id="bad inputs", ), pytest.param( @@ -762,12 +398,12 @@ def test_build_torch_request_direct_unsuccessful( [tensor_3, tensor_4], [], ["output_descriptor2"], - tf_attributes, + torch_attributes, id="bad output descriptors", ), ], ) -def test_build_tf_request_direct_unsuccessful( +def test_build_request_direct_unsuccessful( reply_channel, model, input, output, output_descriptors, custom_attributes ): with pytest.raises(ValueError): @@ -781,31 +417,16 @@ def test_build_tf_request_direct_unsuccessful( ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "req", [ + pytest.param(tf_indirect_request, id="tf indirect"), + pytest.param(tf_direct_request, id="tf direct"), pytest.param(torch_indirect_request, id="indirect"), pytest.param(torch_direct_request, id="direct"), ], ) -def test_serialize_torch_request_successful(req): - serialized = MessageHandler.serialize_request(req) - 
assert type(serialized) == bytes - - deserialized = MessageHandler.deserialize_request(serialized) - assert deserialized.to_dict() == req.to_dict() - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "req", - [ - pytest.param(tf_indirect_request, id="indirect"), - pytest.param(tf_direct_request, id="direct"), - ], -) -def test_serialize_tf_request_successful(req): +def test_serialize_request_successful(req): serialized = MessageHandler.serialize_request(req) assert type(serialized) == bytes diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 9d59a18793..03bd9ba73f 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -28,60 +28,6 @@ from smartsim._core.mli.message_handler import MessageHandler -try: - import tensorflow as tf -except ImportError: - should_run_tf = False -else: - should_run_tf = True - - tflow1 = tf.zeros((3, 2, 5), dtype=tf.int8) - tflow2 = tf.ones((1040, 1040, 3), dtype=tf.int64) - - small_tf_tensor = MessageHandler.build_tensor( - tflow1.numpy(), "c", "int8", list(tflow1.shape) - ) - medium_tf_tensor = MessageHandler.build_tensor( - tflow2.numpy(), "c", "int64", list(tflow2.shape) - ) - - tf_attributes = MessageHandler.build_tf_response_attributes() - - tf_direct_response = MessageHandler.build_response( - "complete", - "Success again!", - [small_tf_tensor, medium_tf_tensor], - tf_attributes, - ) - - -try: - import torch -except ImportError: - should_run_torch = False -else: - should_run_torch = True - - torch1 = torch.zeros((3, 2, 5), dtype=torch.int8) - torch2 = torch.ones((1040, 1040, 3), dtype=torch.int64) - - small_torch_tensor = MessageHandler.build_tensor( - torch1.numpy(), "c", "int8", list(torch1.shape) - ) - medium_torch_tensor = MessageHandler.build_tensor( - torch2.numpy(), "c", "int64", list(torch2.shape) - ) - - torch_attributes = MessageHandler.build_torch_response_attributes() - - torch_direct_response = MessageHandler.build_response( - "complete", - "Success again!", - [small_torch_tensor, medium_torch_tensor], - torch_attributes, - ) - - # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -89,86 +35,51 @@ result_key1 = MessageHandler.build_tensor_key("result_key1") result_key2 = MessageHandler.build_tensor_key("result_key2") +torch_attributes = MessageHandler.build_torch_response_attributes() +tf_attributes = MessageHandler.build_tf_response_attributes() -if should_run_tf: - tf_indirect_response = MessageHandler.build_response( - "complete", - "Success!", - [result_key1, result_key2], - tf_attributes, - ) +tensor1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) -if should_run_torch: - torch_indirect_response = MessageHandler.build_response( - "complete", - "Success!", - [result_key1, result_key2], - torch_attributes, - ) +tf_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + tf_attributes, +) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "status, status_message, result, custom_attribute", - [ - pytest.param( - 200, - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], - None, - id="tensor list", - ), - pytest.param( - 200, - "Yay, it worked!", - [small_torch_tensor], - torch_attributes, - id="small tensor", - ), - pytest.param( - 200, - "Yay, it 
worked!", - [result_key1, result_key2], - torch_attributes, - id="tensor key list", - ), - ], +tf_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor2, tensor1], + tf_attributes, +) + +torch_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + torch_attributes, +) + +torch_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor1, tensor2], + torch_attributes, ) -def test_build_torch_response_successful( - status, status_message, result, custom_attribute -): - response = MessageHandler.build_response( - status=status, - message=status_message, - result=result, - custom_attributes=custom_attribute, - ) - assert response is not None - assert response.status == status - assert response.message == status_message - if response.result.which() == "keys": - assert response.result.keys[0].to_dict() == result[0].to_dict() - else: - assert response.result.data[0].to_dict() == result[0].to_dict() -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "status, status_message, result, custom_attribute", [ pytest.param( 200, "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], + [tensor1, tensor2], None, - id="tensor list", - ), - pytest.param( - 200, - "Yay, it worked!", - [small_tf_tensor], - tf_attributes, - id="small tensor", + id="tensor descriptor list", ), pytest.param( 200, @@ -179,7 +90,7 @@ def test_build_torch_response_successful( ), ], ) -def test_build_tf_response_successful(status, status_message, result, custom_attribute): +def test_build_response_successful(status, status_message, result, custom_attribute): response = MessageHandler.build_response( status=status, message=status_message, @@ -192,25 +103,24 @@ def test_build_tf_response_successful(status, status_message, result, custom_att if response.result.which() == "keys": assert response.result.keys[0].to_dict() == result[0].to_dict() else: - assert response.result.data[0].to_dict() == result[0].to_dict() + assert response.result.descriptors[0].to_dict() == result[0].to_dict() -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @pytest.mark.parametrize( "status, status_message, result, custom_attribute", [ pytest.param( "bad status", "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], + [tensor1, tensor2], None, id="bad status", ), pytest.param( "complete", 200, - [small_tf_tensor], - tf_attributes, + [tensor2], + torch_attributes, id="bad status message", ), pytest.param( @@ -230,110 +140,36 @@ def test_build_tf_response_successful(status, status_message, result, custom_att pytest.param( "complete", "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], - "custom attributes", - id="bad custom attributes", - ), - pytest.param( - "complete", - "Yay, it worked!", - [small_tf_tensor, medium_tf_tensor], - result_key1, - id="bad custom attributes type", - ), - ], -) -def test_build_tf_response_unsuccessful( - status, status_message, result, custom_attribute -): - with pytest.raises(ValueError): - response = MessageHandler.build_response( - status, status_message, result, custom_attribute - ) - - -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") -@pytest.mark.parametrize( - "status, status_message, result, custom_attribute", - [ - pytest.param( - "bad status", - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], - None, - id="bad status", - ), - pytest.param( - "complete", - 200, - 
[small_torch_tensor], - torch_attributes, - id="bad status message", - ), - pytest.param( - "complete", - "Yay, it worked!", - ["result_key1", "result_key2"], - torch_attributes, - id="bad result", - ), - pytest.param( - "complete", - "Yay, it worked!", - [torch_attributes], - torch_attributes, - id="bad result type", - ), - pytest.param( - "complete", - "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], + [tensor2, tensor1], "custom attributes", id="bad custom attributes", ), pytest.param( "complete", "Yay, it worked!", - [small_torch_tensor, medium_torch_tensor], + [tensor2, tensor1], result_key1, id="bad custom attributes type", ), ], ) -def test_build_torch_response_unsuccessful( - status, status_message, result, custom_attribute -): +def test_build_response_unsuccessful(status, status_message, result, custom_attribute): with pytest.raises(ValueError): response = MessageHandler.build_response( status, status_message, result, custom_attribute ) -@pytest.mark.skipif(not should_run_torch, reason="Test needs Torch to run") @pytest.mark.parametrize( "response", [ pytest.param(torch_indirect_response, id="indirect"), pytest.param(torch_direct_response, id="direct"), + pytest.param(tf_indirect_response, id="tf indirect"), + pytest.param(tf_direct_response, id="tf direct"), ], ) -def test_torch_serialize_response(response): - serialized = MessageHandler.serialize_response(response) - assert type(serialized) == bytes - - deserialized = MessageHandler.deserialize_response(serialized) - assert deserialized.to_dict() == response.to_dict() - - -@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -@pytest.mark.parametrize( - "response", - [ - pytest.param(tf_indirect_response, id="indirect"), - pytest.param(tf_direct_response, id="direct"), - ], -) -def test_tf_serialize_response(response): +def test_serialize_response(response): serialized = MessageHandler.serialize_response(response) assert type(serialized) == bytes From 84101b359a327d2b892898ee92d3b10959bcf5df Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 15:50:30 +0200 Subject: [PATCH 12/60] New develop merger (#645) Bring mli-feature up to date with develop. --------- [ committed by @al-rigazzi ] [ reviewed by @AlyssaCote @ankona ] --- .readthedocs.yaml | 6 +- doc/changelog.md | 10 ++++ doc/requirements-doc.txt | 18 ------ docker/docs/dev/Dockerfile | 3 +- setup.py | 56 ++++++++++++------- smartsim/_core/_install/buildenv.py | 4 +- .../_core/launcher/dragon/dragonBackend.py | 14 +++-- 7 files changed, 59 insertions(+), 52 deletions(-) delete mode 100644 doc/requirements-doc.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml index cecdfe3bf3..88f270ba78 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -23,7 +23,7 @@ build: - git clone --depth 1 https://github.com/CrayLabs/SmartRedis.git smartredis - git clone --depth 1 https://github.com/CrayLabs/SmartDashboard.git smartdashboard post_create_environment: - - python -m pip install .[dev] + - python -m pip install .[dev,docs] - cd smartredis; python -m pip install . 
- cd smartredis/doc; doxygen Doxyfile_c; doxygen Doxyfile_cpp; doxygen Doxyfile_fortran - ln -s smartredis/examples ./examples @@ -37,7 +37,3 @@ build: sphinx: configuration: doc/conf.py fail_on_warning: true - -python: - install: - - requirements: doc/requirements-doc.txt \ No newline at end of file diff --git a/doc/changelog.md b/doc/changelog.md index 1c91705add..1deed9dfd7 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -30,6 +30,8 @@ To be released at some future point in time Description +- Fix internal host name representation for Dragon backend +- Make dependencies more discoverable in setup.py - Add hardware pinning capability when using dragon - Pin NumPy version to 1.x - New launcher support for SGE (and similar derivatives) @@ -40,6 +42,14 @@ Description Detailed Notes +- setup.py used to define dependencies in a way that was not amenable + to code scanning tools. Direct dependencies now appear directly + in the setup call and the definition of the SmartRedis version + has been removed + ([SmartSim-PR635](https://github.com/CrayLabs/SmartSim/pull/635)) +- The separate definition of dependencies for the docs in + requirements-doc.txt is now defined as an extra. + ([SmartSim-PR635](https://github.com/CrayLabs/SmartSim/pull/635)) - The new major version release of Numpy is incompatible with modules compiled against Numpy 1.x. For both SmartSim and SmartRedis we request a 1.x version of numpy. This is needed in SmartSim because diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt deleted file mode 100644 index 696881bef3..0000000000 --- a/doc/requirements-doc.txt +++ /dev/null @@ -1,18 +0,0 @@ -Sphinx==6.2.1 -breathe==4.35.0 -sphinx-fortran==1.1.1 -sphinx-book-theme==1.0.1 -sphinx-copybutton==0.5.2 -sphinx-tabs==3.4.4 -nbsphinx==0.9.3 -docutils==0.18.1 -torch==2.0.1 -tensorflow==2.13.1 -ipython -jinja2==3.1.2 -protobuf -numpy -sphinx-design -pypandoc -sphinx-autodoc-typehints -myst_parser diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index e9db9c342b..dbac524bce 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -55,8 +55,7 @@ RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop -- && rm -rf ~/.cache/pip # Install docs dependencies and SmartSim -RUN python -m pip install -r doc/requirements-doc.txt \ - && NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . +RUN NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install .[docs] # Note this is needed to ensure that the Sphinx builds. 
Can be removed with newer Tensorflow RUN python -m pip install typing_extensions==4.6.1 diff --git a/setup.py b/setup.py index b4ad7a6eba..a05ac4174c 100644 --- a/setup.py +++ b/setup.py @@ -165,26 +165,9 @@ def has_ext_modules(_placeholder): # Define needed dependencies for the installation -deps = [ - "packaging>=24.0", - "psutil>=5.7.2", - "coloredlogs>=10.0", - "tabulate>=0.8.9", - "redis>=4.5", - "tqdm>=4.50.2", - "filelock>=3.4.2", - "protobuf~=3.20", - "jinja2>=3.1.2", - "watchdog>=4.0.0", - "pycapnp==2.0.0", - "pydantic==1.10.14", - "pyzmq>=25.1.2", - "pygithub>=2.3.0", - "numpy<2" -] # Add SmartRedis at specific version -deps.append("smartredis>={}".format(versions.SMARTREDIS)) +# install_requires.append("smartredis>={}".format(versions.SMARTREDIS)) extras_require = { "dev": [ @@ -206,6 +189,24 @@ def has_ext_modules(_placeholder): "types-setuptools", "typing_extensions>=4.1.0", ], + "docs": [ + "Sphinx==6.2.1", + "breathe==4.35.0", + "sphinx-fortran==1.1.1", + "sphinx-book-theme==1.0.1", + "sphinx-copybutton==0.5.2", + "sphinx-tabs==3.4.4", + "nbsphinx==0.9.3", + "docutils==0.18.1", + "torch==2.0.1", + "tensorflow==2.13.1", + "ipython", + "jinja2==3.1.2", + "sphinx-design", + "pypandoc", + "sphinx-autodoc-typehints", + "myst_parser", + ], # see smartsim/_core/_install/buildenv.py for more details **versions.ml_extras_required(), } @@ -214,7 +215,24 @@ def has_ext_modules(_placeholder): # rest in setup.cfg setup( version=smartsim_version, - install_requires=deps, + install_requires=[ + "packaging>=24.0", + "psutil>=5.7.2", + "coloredlogs>=10.0", + "tabulate>=0.8.9", + "redis>=4.5", + "tqdm>=4.50.2", + "filelock>=3.4.2", + "protobuf~=3.20", + "jinja2>=3.1.2", + "watchdog>=4.0.0", + "pycapnp==2.0.0", + "pydantic==1.10.14", + "pyzmq>=25.1.2", + "pygithub>=2.3.0", + "numpy<2", + "smartredis>=0.5,<0.6", + ], cmdclass={ "build_py": SmartSimBuild, "install": InstallPlatlib, diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index edb1ff116e..a066ab16ac 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -242,7 +242,7 @@ class Versioner: ``smart build`` command to determine which dependency versions to look for and download. - Default versions for SmartSim, SmartRedis, Redis, and RedisAI are + Default versions for SmartSim, Redis, and RedisAI are all set here. Setting a default version for RedisAI also dictates default versions of the machine learning libraries. 
""" @@ -252,7 +252,6 @@ class Versioner: # Versions SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.7.0")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.3")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis @@ -284,7 +283,6 @@ class Versioner: def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: pkg_map = { "SMARTSIM": self.SMARTSIM, - "SMARTREDIS": self.SMARTREDIS, db_name: self.REDIS, "REDISAI": self.REDISAI, "TORCH": self.TORCH, diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2938746361..445538f20e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -213,10 +213,10 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]: def _initialize_hosts(self) -> None: with self._queue_lock: - self._hosts: t.List[str] = sorted( - node for node in dragon_machine.System().nodes - ) - self._nodes = [dragon_machine.Node(node) for node in self._hosts] + self._nodes = [ + dragon_machine.Node(node) for node in dragon_machine.System().nodes + ] + self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] @@ -471,7 +471,11 @@ def create_run_policy( if run_request.policy.gpu_affinity: affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity - + logger.debug( + f"Affinity strategy: {affinity}, " + f"CPU affinity mask: {cpu_affinity}, " + f"GPU affinity mask: {gpu_affinity}" + ) if affinity != dragon_policy.Policy.Affinity.DEFAULT: return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, From 263e3c72738ab4315652dcb3cf23bbfb07c3a677 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 23:16:10 -0400 Subject: [PATCH 13/60] Fix dragon installation issues (#652) Fix two dragon installation issues: 1. Fix issue where search for `*.whl` files may include previously extracted versions of the dragon package 2. 
Fix issue where LD_LIBRARY_PATH is incorrectly directed to `dragon-0.9` folder by using the generated `.env` file created from `smart build --dragon` [ committed by @ankona ] [ approved by @AlyssaCote ] --- .github/workflows/run_tests.yml | 4 +- doc/changelog.md | 1 + smartsim/_core/_cli/scripts/dragon_install.py | 40 +++---- tests/test_dragon_installer.py | 100 ++++++++++++++++-- 4 files changed, 117 insertions(+), 28 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 6f6648728e..b8e96f05bb 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -119,7 +119,9 @@ jobs: if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset == 'dragon' ) run: | smart build --device cpu --onnx --dragon -v - echo "LD_LIBRARY_PATH=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/.dragon/dragon-0.9/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + SP=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/config/dragon/.env + LLP=$(cat $SP | grep LD_LIBRARY_PATH | awk '{split($0, array, "="); print array[2]}') + echo "LD_LIBRARY_PATH=$LLP:$LD_LIBRARY_PATH" >> $GITHUB_ENV - name: Install ML Runtimes with Smart (no ONNX,TF on Apple Silicon) if: contains( matrix.os, 'macos-14' ) diff --git a/doc/changelog.md b/doc/changelog.md index a954385cae..0ada4e4ec3 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Fix dragon package installation bug - Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example - Add error handling in Worker Manager pipeline diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index a2e8ed36ff..03a128ab86 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -155,38 +155,40 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib :param working_dir: location in file system where assets should be written :param asset: GitHub release asset to retrieve - :returns: path to the downloaded asset""" - if working_dir.exists() and list(working_dir.rglob("*.whl")): - return working_dir + :returns: path to the directory containing the extracted release asset""" + download_dir = working_dir / str(asset.id) + + # if we've previously downloaded the release and still have + # wheels laying around, use that cached version instead + if download_dir.exists() and list(download_dir.rglob("*.whl")): + return download_dir archive = WebTGZ(asset.browser_download_url) - archive.extract(working_dir) + archive.extract(download_dir) - logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}") - return working_dir + logger.debug(f"Retrieved {asset.browser_download_url} to {download_dir}") + return download_dir def install_package(asset_dir: pathlib.Path) -> int: """Install the package found in `asset_dir` into the current python environment :param asset_dir: path to a decompressed archive contents for a release asset""" - wheels = asset_dir.rglob("*.whl") - wheel_path = next(wheels, None) - if not wheel_path: - logger.error(f"No wheel found for package in {asset_dir}") + found_wheels = list(asset_dir.rglob("*.whl")) + if not found_wheels: + logger.error(f"No wheel(s) found for package in {asset_dir}") return 1 - create_dotenv(wheel_path.parent) + create_dotenv(found_wheels[0].parent) - while wheel_path is not None: - 
logger.info(f"Installing package: {wheel_path.absolute()}") + try: + wheels = list(map(str, found_wheels)) + logger.info("Installing packages:\n%s", "\n".join(wheels)) - try: - pip("install", "--force-reinstall", str(wheel_path), "numpy<2") - wheel_path = next(wheels, None) - except Exception: - logger.error(f"Unable to install from {asset_dir}") - return 1 + pip("install", *wheels) + except Exception: + logger.error(f"Unable to install from {asset_dir}") + return 1 return 0 diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index b23a1a7ef0..4bf589ad4c 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -44,6 +44,7 @@ retrieve_asset, retrieve_asset_info, ) +from smartsim._core._install.builder import WebTGZ from smartsim.error.errors import SmartSimCLIActionCancelled # The tests in this file belong to the group_a group @@ -58,14 +59,25 @@ def test_archive(test_dir: str, archive_path: pathlib.Path) -> pathlib.Path: """Fixture for returning a simple tarfile to test on""" num_files = 10 + + archive_name = archive_path.name + archive_name = archive_name.replace(".tar.gz", "") + with tarfile.TarFile.open(archive_path, mode="w:gz") as tar: - mock_whl = pathlib.Path(test_dir) / "mock.whl" + mock_whl = pathlib.Path(test_dir) / archive_name / f"{archive_name}.whl" + mock_whl.parent.mkdir(parents=True, exist_ok=True) mock_whl.touch() + tar.add(mock_whl) + for i in range(num_files): - content = pathlib.Path(test_dir) / f"{i:04}.txt" + content = pathlib.Path(test_dir) / archive_name / f"{i:04}.txt" content.write_text(f"i am file {i}\n") tar.add(content) + content.unlink() + + mock_whl.unlink() + return archive_path @@ -118,6 +130,7 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] _git_attr(value=f"http://foo/{archive_name}"), ) monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=123)) assets.append(asset) return assets @@ -149,11 +162,22 @@ def test_retrieve_cached( test_archive: pathlib.Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Verify that a previously retrieved asset archive is re-used""" - with tarfile.TarFile.open(test_archive) as tar: - tar.extractall(test_dir) + """Verify that a previously retrieved asset archive is re-used and the + release asset retrieval is not attempted""" - ts1 = test_archive.parent.stat().st_ctime + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + mock_webtgz_extract(None, expected_output_dir) + + # get modification time of directory holding the "downloaded" archive + ts1 = expected_output_dir.stat().st_ctime requester = Requester( auth=None, @@ -174,16 +198,76 @@ def test_retrieve_cached( # ensure mocked asset has values that we use... 
monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + # show that retrieving an asset w/a different ID results in ignoring + # other wheels from prior downloads in the parent directory of the asset asset_path = retrieve_asset(test_archive.parent, asset) ts2 = asset_path.stat().st_ctime + # NOTE: the file should be written to a subdir based on the asset ID assert ( - asset_path == test_archive.parent - ) # show that the expected path matches the output path + asset_path == expected_output_dir + ) # shows that the expected path matches the output path assert ts1 == ts2 # show that the file wasn't changed... +def test_retrieve_updated( + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is not re-used if a new + version is found""" + + old_asset_id = 100 + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + old_output_dir = test_archive.parent / str(old_asset_id) + mock_webtgz_extract(None, old_output_dir) + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + monkeypatch.setattr( + WebTGZ, + "extract", + lambda s_, t_: mock_webtgz_extract(s_, expected_output_dir), + ) # mock the retrieval of the updated archive + + # tell it to retrieve. it should return the path to the new download, not the old one + asset_path = retrieve_asset(test_archive.parent, asset) + + # sanity check we don't have the same paths + assert old_output_dir != expected_output_dir + + # verify the "cached" copy wasn't used + assert asset_path == expected_output_dir + + @pytest.mark.parametrize( "dragon_pin,pyv,is_found,is_crayex", [ From 0453b8b5805b563d159cae3b8e32df319a216260 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:23:18 -0400 Subject: [PATCH 14/60] Add FeatureStore descriptor to tensor & model keys (#633) - Enables using multiple feature stores by enhancing the existing tensor/model-key classes to include the feature store descriptor. - Update the `EnvironmentConfigLoader` to retrieve _multiple_ feature stores from environment using the prior key as a prefix to query with - Minor (lift & shift) refactor of top-level functions in worker manager module to reduce number of touch-points for converting to `FeatureStoreKey` from capnproto type - now, only `worker.py` deals with this conversion. 
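As a quick illustration of the API surface described above (a minimal sketch assembled from the calls added in this patch, not part of the committed diff; an attached Dragon DDict `ddict` and a running Dragon environment are assumed):

    # Keys now carry the descriptor of the feature store that owns them, and
    # the worker manager is wired through factories on EnvironmentConfigLoader
    # instead of holding a single feature store instance.
    from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
    from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager
    from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
    from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore
    from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
    from smartsim._core.mli.message_handler import MessageHandler

    backbone = DragonFeatureStore(ddict)  # ddict: attached Dragon DDict (assumed)
    model_key = MessageHandler.build_model_key("resnet-50", backbone.descriptor)
    # tensor keys follow the same pattern, pairing a key with its store's descriptor

    # the loader reads the task queue descriptor from SS_REQUEST_QUEUE, which is
    # published separately (see the standalone worker manager changes below)
    config_loader = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
        callback_factory=DragonCommChannel,
        queue_factory=DragonFLIChannel.from_descriptor,
    )
    worker_manager = WorkerManager(
        config_loader=config_loader,
        worker=TorchWorker(),
        as_service=True,
        cooldown=10,
        device="cpu",
    )
    worker_manager.execute()

With this shape, a request that references keys from several feature stores can be serviced: the manager creates any store it has not seen yet on demand from the key's descriptor via `featurestore_factory`.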
[ committed by @ankona] [ approved by @mellis13 @AlyssaCote @al-rigazzi ] --- doc/changelog.md | 1 + ex/high_throughput_inference/mli_driver.py | 21 +- ex/high_throughput_inference/mock_app.py | 42 ++- .../standalone_workermanager.py | 29 +- smartsim/_core/_cli/scripts/dragon_install.py | 21 +- .../_core/launcher/dragon/dragonBackend.py | 20 +- smartsim/_core/mli/comm/channel/channel.py | 2 + .../_core/mli/comm/channel/dragonchannel.py | 17 ++ smartsim/_core/mli/comm/channel/dragonfli.py | 28 +- .../infrastructure/control/workermanager.py | 279 ++++++++---------- .../mli/infrastructure/environmentloader.py | 98 ++++-- .../storage/dragonfeaturestore.py | 63 +++- .../infrastructure/storage/featurestore.py | 29 +- .../_core/mli/infrastructure/worker/worker.py | 147 +++++++-- smartsim/_core/mli/message_handler.py | 16 +- .../mli_schemas/data/data_references.capnp | 2 + .../data/data_references_capnp.pyi | 2 + tests/dragon/{utils => }/featurestore.py | 88 ++++-- tests/dragon/test_environment_loader.py | 111 +++---- tests/dragon/test_error_handling.py | 85 ++++-- tests/dragon/test_reply_building.py | 31 +- tests/{mli => dragon}/test_worker_manager.py | 82 ++--- tests/dragon/utils/channel.py | 36 ++- tests/dragon/utils/worker.py | 30 +- tests/mli/channel.py | 36 ++- tests/mli/featurestore.py | 83 ++++-- .../mli/test_core_machine_learning_worker.py | 107 ++++--- tests/mli/test_torch_worker.py | 4 +- tests/mli/worker.py | 30 +- tests/test_dragon_run_policy.py | 5 - .../test_build_model_key.py | 7 +- .../test_build_tensor_key.py | 6 +- .../test_output_descriptor.py | 3 +- tests/test_message_handler/test_request.py | 12 +- tests/test_message_handler/test_response.py | 5 +- 35 files changed, 965 insertions(+), 613 deletions(-) rename tests/dragon/{utils => }/featurestore.py (69%) rename tests/{mli => dragon}/test_worker_manager.py (77%) diff --git a/doc/changelog.md b/doc/changelog.md index 0ada4e4ec3..809ad5e8ea 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Enable dynamic feature store selection - Fix dragon package installation bug - Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6f..0cf87ef2e2 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,5 +1,4 @@ - - +import argparse import os import base64 import cloudpickle @@ -26,11 +25,23 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + device, + "--worker_class", + torch_worker_str, + ], +) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", device], +) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) @@ -47,4 +58,4 @@ break time.sleep(5) -print("Exiting.") \ No newline at end of file +print("Exiting.") 
diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e244c93e0f..3a5169a668 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -44,16 +44,21 @@ import numbers from collections import OrderedDict +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger("App") + class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None while to_worker_fli_str is None: try: @@ -88,17 +93,23 @@ def start_timings(self, batch_size: int): def end_timings(self): if self._timing_on: self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + self._timings["total_time"].append( + self._format_number(time.perf_counter() - self._start) + ) def measure_time(self, label: str): if self._timing_on: self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) self._interm = time.perf_counter() def print_timings(self, to_file: bool = False): print(" ".join(self._timings.keys())) - value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.array( + [value for value in self._timings.values()], dtype=float + ) value_array = numpy.transpose(value_array) for i in range(value_array.shape[0]): print(" ".join(self._format_number(value) for value in value_array[i])) @@ -106,21 +117,21 @@ def print_timings(self, to_file: bool = False): numpy.save("timings.npy", value_array) numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] self.start_timings(batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape)) + "c", "float32", list(batch.shape) + ) self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model) + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) else: model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model= model_arg, + model=model_arg, inputs=[built_tensor_desc], outputs=[], output_descriptors=[], @@ -129,10 +140,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: - to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + to_sendh.send_bytes(t.tobytes()) # TODO NOT FAST ENOUGH!!! 
# to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") @@ -159,7 +172,7 @@ def set_model(self, key: str, model: bytes): self._ddict[key] = model -class ResNetWrapper(): +class ResNetWrapper: def __init__(self, name: str, model: str): self._model = torch.jit.load(model) self._name = name @@ -168,7 +181,7 @@ def __init__(self, name: str, model: str): torch.jit.save(scripted, buffer) self._serialized_model = buffer.getvalue() - def get_batch(self, batch_size: int=32): + def get_batch(self, batch_size: int = 32): return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property @@ -179,6 +192,7 @@ def model(self): def name(self): return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -194,8 +208,8 @@ def name(self): for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for iteration_number in range(total_iterations + int(batch_size == 1)): logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.name, resnet.get_batch(batch_size)) - client.print_timings(to_file=True) \ No newline at end of file + client.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c3..2b5ba7df42 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -31,17 +31,19 @@ from dragon.data.ddict.ddict import DDict from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure + # isort: on import argparse import base64 import cloudpickle -import pickle +import optparse import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader @@ -67,7 +69,7 @@ args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() @@ -75,22 +77,23 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() - - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) + worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) + torch_worker = cloudpickle.loads(worker_type_name)() - os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") - os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["SS_REQUEST_QUEUE"] = descriptor - config_loader = EnvironmentConfigLoader() + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + 
callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) worker_manager = WorkerManager( config_loader=config_loader, worker=torch_worker, as_service=True, cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, + device=args.device, ) worker_manager.execute() diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 03a128ab86..f88af4eb4f 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -1,7 +1,9 @@ import os import pathlib +import shutil import sys import typing as t +from urllib.request import urlretrieve from github import Github from github.GitReleaseAsset import GitReleaseAsset @@ -160,13 +162,26 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead - if download_dir.exists() and list(download_dir.rglob("*.whl")): + if download_dir.exists() or list(download_dir.rglob("*.whl")): return download_dir - archive = WebTGZ(asset.browser_download_url) + download_dir.mkdir(parents=True, exist_ok=True) + + # grab a copy of the complete asset + asset_path = download_dir / str(asset.name) + download_url = asset.browser_download_url + + try: + urlretrieve(download_url, str(asset_path)) + logger.debug(f"Retrieved asset {asset.name} from {download_url}") + except Exception: + logger.exception(f"Unable to download asset from: {download_url}") + + # extract the asset + archive = WebTGZ(download_url) archive.extract(download_dir) - logger.debug(f"Retrieved {asset.browser_download_url} to {download_dir}") + logger.debug(f"Extracted {download_url} to {download_dir}") return download_dir diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 445538f20e..4fe6d55ad6 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -457,7 +457,6 @@ def create_run_policy( if isinstance(request, DragonRunRequest): run_request: DragonRunRequest = request - affinity = dragon_policy.Policy.Affinity.DEFAULT cpu_affinity: t.List[int] = [] gpu_affinity: t.List[int] = [] @@ -465,25 +464,20 @@ def create_run_policy( if run_request.policy is not None: # Affinities are not mutually exclusive. 
If specified, both are used if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC cpu_affinity = run_request.policy.cpu_affinity if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity logger.debug( - f"Affinity strategy: {affinity}, " f"CPU affinity mask: {cpu_affinity}, " f"GPU affinity mask: {gpu_affinity}" ) - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, @@ -521,7 +515,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "SS_DRG_DDICT": self.infra_ddict, + "SS_INFRA_BACKBONE": self.infra_ddict, }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index a3cce21814..d918591264 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -42,11 +42,13 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: """Send a message through the underlying communication channel + :param value: The value to send""" @abstractmethod def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel + :returns: the received message""" @property diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 672fce75b2..80fdd9cdc6 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import base64 import sys import typing as t @@ -55,7 +56,23 @@ def send(self, value: bytes) -> None: def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel + :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: message_bytes: bytes = recvh.recv_bytes(timeout=None) return [message_bytes] + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonCommChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonCommChannel""" + try: + return DragonCommChannel(base64.b64decode(descriptor)) + except: + logger.error(f"Failed to create dragon comm channel: {descriptor}") + raise diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 28b4c2bf3b..4636894bdd 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -30,7 +30,7 @@ # isort: on -import sys +import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -43,7 +43,11 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: - """Initialize the DragonFLIChannel instance""" + """Initialize the DragonFLIChannel instance + + :param fli_desc: the descriptor of the FLI channel to attach + :param sender_supplied: flag indicating if the FLI uses sender-supplied streams + """ super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? self._fli: "fli" = fli.FLInterface.attach(fli_desc) @@ -53,12 +57,14 @@ def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: def send(self, value: bytes) -> None: """Send a message through the underlying communication channel + :param value: The value to send""" with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel + :returns: the received message""" messages = [] eot = False @@ -70,3 +76,21 @@ def recv(self) -> t.List[bytes]: except fli.FLIEOT as exc: eot = True return messages + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFLIChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFLIChannel""" + try: + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) + except: + logger.error(f"Error while creating DragonFLIChannel: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 27f5bfc971..dcc35ae831 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,26 +24,16 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import sys - -# isort: off -import dragon -from dragon import fli - -# isort: on - import time import typing as t -import numpy as np +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from .....error import SmartSimError from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, @@ -51,113 +41,24 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response, ResponseBuilder +from ...mli_schemas.response.response_capnp import ResponseBuilder if t.TYPE_CHECKING: from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import Status - from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... client shouldn't have to touch - # callback (or batch size) - - request = MessageHandler.deserialize_request(data_blob) - # return request - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.descriptor - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = None - - output_keys: t.Optional[t.List[str]] = None - - input_meta: t.Optional[t.List[TensorDescriptor]] = None - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "descriptors": - input_meta = request.input.descriptors # type: ignore - - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_keys=input_keys, - input_meta=input_meta, - output_keys=output_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + """Build a response indicating a failure occurred + :param status: The status of the response + :param message: The error message to include in the response""" return MessageHandler.build_response( status=status, message=message, - result=[], - custom_attributes=None, - ) - - -def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: - prepared_outputs: 
t.List[t.Any] = [] - if reply.output_keys: - for key in reply.output_keys: - if not key: - continue - msg_key = MessageHandler.build_tensor_key(key) - prepared_outputs.append(msg_key) - elif reply.outputs: - for _ in reply.outputs: - msg_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", - "float32", - [1], - ) - prepared_outputs.append(msg_tensor_desc) - return prepared_outputs - - -def build_reply(reply: InferenceReply) -> ResponseBuilder: - results = prepare_outputs(reply) - - return MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, + result=None, custom_attributes=None, ) @@ -194,73 +95,127 @@ def __init__( worker: MachineLearningWorkerBase, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager + :param config_loader: Environment config loader that loads the task queue and feature store :param workers: A worker to manage :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param comm_channel_type: The type of communication channel used for callbacks + :param device: The type of hardware the workers must be executed on """ super().__init__(as_service, cooldown) self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() """the queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """a feature store to retrieve models from""" self._worker = worker """The ML Worker implementation""" - self._comm_channel_type = comm_channel_type + self._callback_factory = config_loader._callback_factory """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. 
- :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False - - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False - - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False - - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False + def _check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") + :param request: The request to validate + :returns: False if feature store validation fails for the request, True otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") return False - if request.callback is None: - logger.error("No callback channel provided in request") - return False + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store return True + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + + :param request: The request to validate + :returns: False if model validation fails for the request, True otherwise + """ + if request.model_key or request.raw_model: + return True + + logger.error("Unable to continue without model bytes or feature store key") + return False + + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ + if request.input_keys or request.raw_inputs: + return True + + logger.error("Unable to continue without input bytes or feature store keys") + return False + + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ + if request.callback is not None: + return True + + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed + + :param request: The request to validate + :return: False if the request fails any validation checks, True otherwise""" + checks = [ + self._check_feature_stores(request), + self._check_model(request), + 
self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) + def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete + the inference pipeline""" logger.debug("executing worker manager pipeline") if self._task_queue is None: - logger.warning("No queue to check for tasks") + logger.error("No queue to check for tasks") return timings = [] # timing @@ -279,15 +234,19 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] interm = time.perf_counter() # timing - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device + request = self._worker.deserialize_message( + request_bytes, self._callback_factory ) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list if not self._validate_request(request): - return + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing @@ -302,17 +261,20 @@ def _on_iteration(self) -> None: "Could not find model key or model.", ) return - if request.model_key in self._cached_models: + + if request.model_key.key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - model_result = LoadModelResult(self._cached_models[request.model_key]) + model_result = LoadModelResult( + self._cached_models[request.model_key.key] + ) else: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -328,10 +290,12 @@ def _on_iteration(self) -> None: fetch_result=fetch_model_result, device=self._device, ) - self._cached_models[request.model_key] = model_result.model + self._cached_models[request.model_key.key] = model_result.model except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." + e, + request.callback, + "Failed while loading model from feature store.", ) return @@ -340,7 +304,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -356,14 +320,18 @@ def _on_iteration(self) -> None: ) except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." 
+ e, + request.callback, + "Failed while loading model from feature store.", ) return timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing try: - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + fetch_input_result = self._worker.fetch_inputs( + request, self._feature_stores + ) except Exception as e: exception_handler(e, request.callback, "Failed while fetching the inputs.") return @@ -407,9 +375,7 @@ def _on_iteration(self) -> None: if request.output_keys: try: reply.output_keys = self._worker.place_output( - request, - transformed_output, - self._feature_store, + request, transformed_output, self._feature_stores ) except Exception as e: exception_handler( @@ -427,7 +393,14 @@ def _on_iteration(self) -> None: else: reply.status_enum = "complete" reply.message = "Success" - response = build_reply(reply) + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 9f6770623d..b4b9e565ce 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -24,44 +24,82 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 import os -import pickle import typing as t -from dragon.fli import FLInterface # pylint: disable=all - -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) class EnvironmentConfigLoader: """ - Facilitates the loading of a FeatureStore and Queue - into the WorkerManager. + Facilitates the loading of a FeatureStore and Queue into the WorkerManager. """ - def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv( - "SSFeatureStore", None - ) - self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - self.feature_store: t.Optional[FeatureStore] = None - self.queue: t.Optional[DragonFLIChannel] = None - - def get_feature_store(self) -> t.Optional[FeatureStore]: - """Loads the Feature Store previously set in SSFeatureStore""" - if self._feature_store_descriptor is not None: - self.feature_store = pickle.loads( - base64.b64decode(self._feature_store_descriptor) - ) - return self.feature_store - - def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: - """Returns the Queue previously set in SSQueue""" - if self._queue_descriptor is not None: - self.queue = DragonFLIChannel( - fli_desc=base64.b64decode(self._queue_descriptor), - sender_supplied=sender_supplied, - ) + def __init__( + self, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], + ) -> None: + """Initialize the config loader instance with the factories necessary for + creating additional objects. 
+ + :param featurestore_factory: A factory method that produces a feature store + given a descriptor + :param callback_factory: A factory method that produces a callback + channel given a descriptor + :param queue_factory: A factory method that produces a queue + channel given a descriptor""" + self.queue: t.Optional[CommChannelBase] = None + """The attached incoming event queue channel""" + self.backbone: t.Optional[FeatureStore] = None + """The attached backbone feature store""" + self._featurestore_factory = featurestore_factory + """A factory method to instantiate a FeatureStore""" + self._callback_factory = callback_factory + """A factory method to instantiate a concrete CommChannelBase + for inference callbacks""" + self._queue_factory = queue_factory + """A factory method to instantiate a concrete CommChannelBase + for inference requests""" + + def get_backbone(self) -> t.Optional[FeatureStore]: + """Attach to the backbone feature store using the descriptor found in + an environment variable. The backbone is a standalone, system-created + feature store used to share internal information among MLI components + + :returns: The attached feature store via SS_INFRA_BACKBONE""" + descriptor = os.getenv("SS_INFRA_BACKBONE", "") + + if not descriptor: + logger.warning("No backbone descriptor is configured") + return None + + if self._featurestore_factory is None: + logger.warning("No feature store factory is configured") + return None + + self.backbone = self._featurestore_factory(descriptor) + return self.backbone + + def get_queue(self) -> t.Optional[CommChannelBase]: + """Attach to a queue-like communication channel using the descriptor + found in an environment variable. + + :returns: The attached queue specified via `SS_REQUEST_QUEUE`""" + descriptor = os.getenv("SS_REQUEST_QUEUE", "") + + if not descriptor: + logger.warning("No queue descriptor is configured") + return None + + if self._queue_factory is None: + logger.warning("No queue factory is configured") + return None + + self.queue = self._queue_factory(descriptor) return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index af592ed0ab..e89abcd2a2 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -26,13 +26,15 @@ import typing as t -import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.log import get_logger +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict -if t.TYPE_CHECKING: - from dragon.data.ddict.ddict import DDict +# isort: on +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error import SmartSimError +from smartsim.log import get_logger logger = get_logger(__name__) @@ -40,32 +42,67 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" - def __init__(self, storage: "DDict") -> None: - """Initialize the DragonFeatureStore instance""" + def __init__(self, storage: "dragon_ddict.DDict") -> None: + """Initialize the DragonFeatureStore instance + + :param storage: A distributed dictionary to be used as the underlying + storage mechanism of the feature store""" self._storage = storage def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key - :param key: Unique key of an item to 
retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: The value identified by the supplied key + :raises KeyError: if the key is not found in the feature store + :raises SmartSimError: if retrieval from the feature store fails""" try: value: t.Union[str, bytes] = self._storage[key] return value - except KeyError as ex: - raise ex + except KeyError: + logger.warning(f"An unknown key was requested: {key}") + raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError( + raise SmartSimError( f"Could not get value for existing key {key}, error:\n{ex}" ) from ex def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key + :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" self._storage[key] = value def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" return key in self._storage + + @property + def descriptor(self) -> str: + """A unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + return str(self._storage.serialize()) + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFeatureStore + :raises SmartSimError: if attachment to DragonFeatureStore fails""" + try: + return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) + except Exception as ex: + logger.error(f"Error creating dragon feature store: {descriptor}") + raise SmartSimError( + f"Error creating dragon feature store: {descriptor}" + ) from ex diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 553e13b10f..d511d588e1 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -27,6 +27,21 @@ import typing as t from abc import ABC, abstractmethod +from pydantic import BaseModel, Field + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class FeatureStoreKey(BaseModel): + """A key,descriptor pair enabling retrieval of an item from a feature store""" + + key: str = Field(min_length=1) + """The unique key of an item in a feature store""" + descriptor: str = Field(min_length=1) + """The unique identifier of the feature store containing the key""" + class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving @@ -35,16 +50,26 @@ class FeatureStore(ABC): @abstractmethod def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key + :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" @abstractmethod def __contains__(self, 
key: str) -> bool: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + + @property + @abstractmethod + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index bb8d822311..89fb635247 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -30,11 +30,13 @@ from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase -from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -44,26 +46,32 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[str] = None, + model_key: t.Optional[FeatureStoreKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - # todo: copying byte array is likely to create a copy of the data in - # capnproto and will be a performance issue later - input_keys: t.Optional[t.List[str]] = None, + input_keys: t.Optional[t.List[FeatureStoreKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[str]] = None, + output_keys: t.Optional[t.List[FeatureStoreKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): """Initialize the object""" self.model_key = model_key + """A tuple containing a (key, descriptor) pair""" self.raw_model = raw_model + """Raw bytes of an ML model""" self.callback = callback + """The channel used for notification of inference completion""" self.raw_inputs = raw_inputs or [] + """Raw bytes of tensor inputs""" self.input_keys = input_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.input_meta = input_meta or [] + """Metadata about the input data""" self.output_keys = output_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.batch_size = batch_size + """The batch size to apply when batching""" class InferenceReply: @@ -72,13 +80,13 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[str]] = None, + output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: """Initialize the object""" self.outputs: t.Collection[t.Any] = outputs or [] - self.output_keys: t.Collection[t.Optional[str]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] self.status_enum = status_enum self.message = message @@ -148,13 +156,88 @@ def __init__(self, result: bytes) -> None: class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" + 
@staticmethod + def deserialize_message( + data_blob: bytes, + callback_factory: t.Callable[[bytes], CommChannelBase], + ) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize + :param callback_factory: A factory method that can create an instance + of the desired concrete comm channel type + :returns: The raw input message deserialized into an InferenceRequest + """ + request = MessageHandler.deserialize_request(data_blob) + model_key: t.Optional[FeatureStoreKey] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = FeatureStoreKey( + key=request.model.key.key, + descriptor=request.model.key.featureStoreDescriptor, + ) + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.descriptor + comm_channel = callback_factory(callback_key) + input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_bytes: t.Optional[t.List[bytes]] = None + output_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_meta: t.Optional[t.List[TensorDescriptor]] = None + + if request.input.which() == "keys": + input_keys = [ + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + for value in request.input.keys + ] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore + + if request.output: + output_keys = [ + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + for value in request.output + ] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + output_keys=output_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + @staticmethod + def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + prepared_outputs: t.List[t.Any] = [] + if reply.output_keys: + for value in reply.output_keys: + if not value: + continue + msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor) + prepared_outputs.append(msg_key) + elif reply.outputs: + for _ in reply.outputs: + msg_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", + "float32", + [1], + ) + prepared_outputs.append(msg_tensor_desc) + return prepared_outputs + @staticmethod def fetch_model( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: Raw bytes of the model""" if request.raw_model: @@ -164,7 +247,7 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model.data) - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for model retrieval") if not request.model_key: @@ -172,44 +255,47 @@ def fetch_model( "Key must be provided to retrieve model from feature store" ) + key, fsd = request.model_key.key, request.model_key.descriptor + try: - raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) + feature_store = feature_stores[fsd] + raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) except FileNotFoundError 
as ex: logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {request.model_key}" - ) from ex + raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @staticmethod def fetch_inputs( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchInputResult: """Given a collection of ResourceKeys, identify the physical location and input metadata :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: the fetched input""" if request.raw_inputs: return FetchInputResult(request.raw_inputs, request.input_meta) - if not feature_store: + if not feature_stores: raise ValueError("No input and no feature store provided") if request.input_keys: data: t.List[bytes] = [] - for input_ in request.input_keys: + + for fs_key in request.input_keys: try: - tensor_bytes = t.cast(bytes, feature_store[input_]) + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {input_}" + f"Model could not be retrieved with key {fs_key.key}" ) from ex return FetchInputResult( - data, None + data, meta=None ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -231,25 +317,26 @@ def batch_requests( def place_output( request: InferenceRequest, transform_result: TransformOutputResult, - feature_store: t.Optional[FeatureStore], - ) -> t.Collection[t.Optional[str]]: + feature_stores: t.Dict[str, FeatureStore], + ) -> t.Collection[t.Optional[FeatureStoreKey]]: """Given a collection of data, make it available as a shared resource in the feature store :param request: The request that triggered the pipeline :param execute_result: Results from inference - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: A collection of keys that were placed in the feature store""" - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[str]] = [] + keys: t.List[t.Optional[FeatureStoreKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. # Consider parallelizing all PUT feature_store operations - for k, v in zip(request.output_keys, transform_result.outputs): - feature_store[k] = v - keys.append(k) + for fs_key, v in zip(request.output_keys, transform_result.outputs): + feature_store = feature_stores[fs_key.descriptor] + feature_store[fs_key.key] = v + keys.append(fs_key) return keys diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 00670dce8a..ee632e24ea 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -92,16 +92,21 @@ def build_output_tensor_descriptor( return description @staticmethod - def build_tensor_key(key: str) -> data_references_capnp.TensorKey: + def build_tensor_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.TensorKey: """ Builds a new TensorKey message with the provided key. 
:param key: String to set the TensorKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: tensor_key = data_references_capnp.TensorKey.new_message() tensor_key.key = key + tensor_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building tensor key.") from e return tensor_key @@ -126,16 +131,21 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: return model @staticmethod - def build_model_key(key: str) -> data_references_capnp.ModelKey: + def build_model_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.ModelKey: """ Builds a new ModelKey message with the provided key. :param key: String to set the ModelKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: model_key = data_references_capnp.ModelKey.new_message() model_key.key = key + model_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building model key.") from e return model_key @@ -433,6 +443,7 @@ def _assign_result( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None, ], ) -> None: """ @@ -498,6 +509,7 @@ def build_response( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None, ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index f37a957267..699abe5d22 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,8 +28,10 @@ struct ModelKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } struct TensorKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index 6f775cf8f4..bcf53e0a04 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,6 +36,7 @@ from typing import Iterator class ModelKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( @@ -71,6 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/tests/dragon/utils/featurestore.py b/tests/dragon/featurestore.py similarity index 69% rename from tests/dragon/utils/featurestore.py rename to tests/dragon/featurestore.py index 93b3134318..d06035fd70 100644 --- a/tests/dragon/utils/featurestore.py +++ b/tests/dragon/featurestore.py @@ -29,6 +29,9 @@ import smartsim.error as sse from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) class MemoryFeatureStore(FeatureStore): @@ -40,6 +43,7 @@ def __init__(self) -> None: def __getitem__(self, key: str) -> bytes: """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" if key not in self._storage: raise sse.SmartSimError(f"{key} not found in feature store") @@ -47,28 +51,43 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: 
str, value: bytes) -> None: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" self._storage[key] = value def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" return key in self._storage + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + return "file-system-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. Stores all data on the file system""" - def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + def __init__( + self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None + ) -> None: """Initialize the FileSystemFeatureStore instance + :param storage_dir: (optional) root directory to store all data relative to""" + if isinstance(storage_dir, str): + storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir def __getitem__(self, key: str) -> bytes: """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" path = self._key_path(key) if not path.exists(): @@ -77,6 +96,7 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: str, value: bytes) -> None: """Assign a value using key + :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" path = self._key_path(key, create=True) @@ -84,14 +104,16 @@ def __setitem__(self, key: str, value: bytes) -> None: def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" path = self._key_path(key) return path.exists() def _key_path(self, key: str, create: bool = False) -> pathlib.Path: """Given a key, return a path that is optionally combined with a base directory used by the FileSystemFeatureStore. 
+ :param key: Unique key of an item to retrieve from the feature store""" value = pathlib.Path(key) @@ -103,26 +125,32 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value - -class DragonDict: - """Mock implementation of a dragon dictionary""" - - def __init__(self) -> None: - """Initialize the mock DragonDict instance""" - self._storage: t.Dict[bytes, t.Any] = {} - - def __getitem__(self, key: bytes) -> t.Any: - """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" - return self._storage[key] - - def __setitem__(self, key: bytes, value: t.Any) -> None: - """Assign a value using key - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" - self._storage[key] = value - - def __contains__(self, key: bytes) -> bool: - """Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" - return key in self._storage + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" + try: + path = pathlib.Path(descriptor) + path.mkdir(parents=True, exist_ok=True) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 00db0a9d32..6ae5d2b301 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -24,10 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
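[Editor's note: a minimal usage sketch of the updated key-building API introduced above, where every TensorKey/ModelKey now carries the descriptor of the feature store that owns it. The key names and descriptor value are placeholders, not values from this patch.]

    from smartsim._core.mli.message_handler import MessageHandler

    fs_descriptor = "/tmp/example-feature-store"  # placeholder descriptor value
    tensor_key = MessageHandler.build_tensor_key("input_0", fs_descriptor)
    model_key = MessageHandler.build_model_key("example-model", fs_descriptor)
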
-import base64 -import os -import pickle - import pytest dragon = pytest.importorskip("dragon") @@ -37,13 +33,13 @@ from dragon.data.ddict.ddict import DDict from dragon.fli import DragonFLIError, FLInterface +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from .utils.featurestore import MemoryFeatureStore - # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -55,97 +51,80 @@ pytest.param(b"new byte string"), ], ) -def test_environment_loader_attach_FLI(content, monkeypatch): +def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): """A descriptor can be stored, loaded, and reattached""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) config_queue = config.get_queue() - new_sender = config_queue.send(content) + _ = config_queue.send(content) old_recv = queue.recvh() result, _ = old_recv.recv_bytes() assert result == content -def test_environment_loader_serialize_FLI(monkeypatch): +def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded queue are the same""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) config_queue = config.get_queue() assert config_queue._fli.serialize() == queue.serialize() -def test_environment_loader_FLI_fails(monkeypatch): +def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" - monkeypatch.setenv("SSQueue", "randomstring") - config = EnvironmentConfigLoader() + monkeypatch.setenv("SS_REQUEST_QUEUE", "randomstring") + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=None, + queue_factory=DragonFLIChannel.from_descriptor, + ) with pytest.raises(DragonFLIError): - config_queue = config.get_queue() - - -@pytest.mark.parametrize( - "expected_keys, expected_values", - [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), - ], -) -def test_environment_loader_memory_featurestore( - expected_keys, expected_values, monkeypatch -): - """MemoryFeatureStores can be correctly serialized and deserialized""" - feature_store = MemoryFeatureStore() - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", 
base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() + config.get_queue() - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] +def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): + """Verify the dragon feature store is loaded correctly by the + EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" + feature_store = DragonFeatureStore(DDict()) + monkeypatch.setenv("SS_INFRA_BACKBONE", feature_store.descriptor) -@pytest.mark.parametrize( - "expected_keys, expected_values", - [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), - ], -) -def test_environment_loader_dragon_featurestore( - expected_keys, expected_values, monkeypatch -): - """DragonFeatureStores can be correctly serialized and deserialized""" - storage = DDict() - feature_store = DragonFeatureStore(storage) - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=None, + queue_factory=None, ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] + print(f"calling config.get_backbone: `{feature_store.descriptor}`") + + backbone = config.get_backbone() + assert backbone is not None def test_environment_variables_not_set(): """EnvironmentConfigLoader getters return None when environment variables are not set""" - config = EnvironmentConfigLoader() - assert config.get_feature_store() == None - assert config.get_queue() == None + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, + ) + assert config.get_backbone() is None + assert config.get_queue() is None diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 151bdd2fcc..208ab1e5e9 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,8 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
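[Editor's note: a condensed sketch of how the factory-based EnvironmentConfigLoader shown above is expected to be wired, assuming a dragon runtime and that SS_INFRA_BACKBONE / SS_REQUEST_QUEUE were exported by the launcher before the loader is constructed.]

    from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
    from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
    from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore

    config = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
        callback_factory=DragonCommChannel.from_descriptor,
        queue_factory=DragonFLIChannel.from_descriptor,
    )

    backbone = config.get_backbone()  # None (with a warning) if SS_INFRA_BACKBONE is unset
    task_queue = config.get_queue()   # None (with a warning) if SS_REQUEST_QUEUE is unset
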
-import base64 -import pickle from unittest.mock import MagicMock import pytest @@ -37,6 +35,7 @@ from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, @@ -45,6 +44,7 @@ from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, @@ -64,30 +64,51 @@ @pytest.fixture -def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): +def backbone_descriptor() -> str: + # create a shared backbone featurestore + feature_store = DragonFeatureStore(DDict()) + return feature_store.descriptor + + +@pytest.fixture +def app_feature_store() -> FeatureStore: + # create a standalone feature store to mimic a user application putting + # data into an application-owned resource (app should not access backbone) + app_fs = DragonFeatureStore(DDict()) + return app_fs + + +@pytest.fixture +def setup_worker_manager_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( - test_dir, model, [tensor_key], [tensor_key], [], None + test_dir, model, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -96,30 +117,38 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): @pytest.fixture -def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): +def setup_worker_manager_model_key( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) 
+ monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") - model_key = MessageHandler.build_model_key("model key") + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + "model key", app_feature_store.descriptor + ) request = MessageHandler.build_request( - test_dir, model_key, [tensor_key], [tensor_key], [], None + test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -162,7 +191,11 @@ def mock_exception_handler(exc, reply_channel, failure_message): pytest.param( "fetch_model", "Failed while fetching the model.", id="fetch model" ), - pytest.param("load_model", "Failed while loading the model.", id="load model"), + pytest.param( + "load_model", + "Failed while loading model from feature store.", + id="load model", + ), pytest.param( "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" ), diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index d1c4d226bb..5f179bbae0 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -30,10 +30,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.control.workermanager import ( - build_failure_reply, - build_reply, -) +from smartsim._core.mli.infrastructure.control.workermanager import build_failure_reply from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: @@ -63,29 +60,3 @@ def test_build_failure_reply_fails(): response = build_failure_reply("not a status enum", "message") assert "Error assigning status to response" in ex.value.args[0] - - -@pytest.mark.parametrize( - "status, message", - [ - pytest.param("complete", "Success", id="complete"), - ], -) -def test_build_reply(status: "Status", message: str): - "Ensures replies can be built successfully" - reply = InferenceReply() - reply.status_enum = status - reply.message = message - response = build_reply(reply) - assert response.status == status - assert response.message == message - - -def test_build_reply_fails(): - "Ensures ValueError is raised if a Status Enum is not used" - with pytest.raises(ValueError) as ex: - reply = InferenceReply() - reply.status_enum = "not a status enum" - response = build_reply(reply) - - assert "Error assigning status to response" in ex.value.args[0] diff --git a/tests/mli/test_worker_manager.py b/tests/dragon/test_worker_manager.py similarity index 77% rename from tests/mli/test_worker_manager.py rename to tests/dragon/test_worker_manager.py index df4b0a637f..864e14993c 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -35,46 +35,34 @@ torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") +import base64 +import os + 
+import dragon.channels as dch +from dragon import fli + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( EnvironmentConfigLoader, WorkerManager, ) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .channel import FileSystemCommChannel from .featurestore import FileSystemFeatureStore -from .worker import IntegratedTorchWorker +from .utils.channel import FileSystemCommChannel logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: - """Mock event producer for triggering the inference pipeline""" - # todo: move to unit tests - while True: - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - timestamp = time.time_ns() - output_dir = "/lus/bnchlu1/mcbridch/code/ss/_tmp" - output_path = pathlib.Path(output_dir) - - mock_channel = output_path / f"brainstorm-{timestamp}.txt" - mock_model = output_path / "brainstorm.pt" - - output_path.mkdir(parents=True, exist_ok=True) - mock_channel.touch() - mock_model.touch() - - msg = f"PyTorch:{mock_model}:MockInputToReplace:{mock_channel}" - worker_manager_queue.put(msg.encode("utf-8")) - - def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. 
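[Editor's note: a hedged sketch of the request path these fixtures exercise: keys are built against an application feature store's descriptor, assembled into a request, and serialized before being sent over the task queue. The descriptors below are placeholders; only the argument order mirrors the fixtures in this patch.]

    from smartsim._core.mli.message_handler import MessageHandler

    app_descriptor = "/tmp/example-app-feature-store"  # placeholder feature store descriptor
    callback_descriptor = "/tmp/example-callback"      # placeholder reply-channel descriptor

    model_key = MessageHandler.build_model_key("example-model", app_descriptor)
    input_key = MessageHandler.build_tensor_key("example-input", app_descriptor)
    output_key = MessageHandler.build_tensor_key("example-output", app_descriptor)

    # argument order as used by the fixtures above: reply channel, model,
    # inputs, outputs, output descriptors, custom attributes
    request = MessageHandler.build_request(
        callback_descriptor, model_key, [input_key], [output_key], [], None
    )
    request_bytes = MessageHandler.serialize_request(request)
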
@@ -94,7 +82,7 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( - worker_manager_queue: "mp.Queue[bytes]", + worker_manager_queue: CommChannelBase, feature_store: FeatureStore, feature_store_root_dir: pathlib.Path, comm_channel_root_dir: pathlib.Path, @@ -139,10 +127,11 @@ def mock_messages( tensor = torch.randn((1, 2), dtype=torch.float32) torch.save(tensor, buffer) feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor - message_tensor_output_key = MessageHandler.build_tensor_key(output_key) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key) - message_model_key = MessageHandler.build_model_key(model_key) + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( reply_channel=callback_channel.descriptor, @@ -153,7 +142,7 @@ def mock_messages( custom_attributes=None, ) request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.put(request_bytes) + worker_manager_queue.send(request_bytes) @pytest.fixture @@ -171,30 +160,49 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: fs_path = test_path / "feature_store" comm_path = test_path / "comm_store" - config_loader = EnvironmentConfigLoader() - integrated_worker = IntegratedTorchWorker() + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["SS_REQUEST_QUEUE"] = descriptor + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker = TorchWorker() worker_manager = WorkerManager( config_loader, integrated_worker, as_service=True, - cooldown=10, - comm_channel_type=FileSystemCommChannel, + cooldown=5, + device="cpu", ) + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, args=( - config_loader.get_queue(), - config_loader.get_feature_store(), + worker_queue, + FileSystemFeatureStore(fs_path), fs_path, comm_path, ), ) msg_pump.start() - # # create a process to process commands + # create a process to execute commands process = mp.Process(target=worker_manager.execute) process.start() process.join(timeout=5) diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index df76c484b5..08b659c072 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
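[Editor's note: for orientation, a condensed sketch of the queue bootstrap used by the worker-manager test above. It assumes the dragon FLI API exactly as exercised in this patch; the variable names are illustrative.]

    import base64
    import os

    import dragon.channels as dch
    from dragon import fli

    # Create the inbound FLI work queue and advertise its descriptor before
    # constructing the EnvironmentConfigLoader, so get_queue() can attach to it.
    to_worker_channel = dch.Channel.make_process_local()
    to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
    os.environ["SS_REQUEST_QUEUE"] = base64.b64encode(
        to_worker_fli.serialize()
    ).decode("utf-8")
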
import pathlib +import threading import typing as t from smartsim._core.mli.comm.channel.channel import CommChannelBase @@ -37,7 +38,11 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: - """Initialize the FileSystemCommChannel instance""" + """Initialize the FileSystemCommChannel instance + + :param key: a path to the root directory of the feature store""" + self._lock = threading.RLock() + if not isinstance(key, bytes): super().__init__(key.as_posix().encode("utf-8")) self._file_path = key @@ -52,13 +57,38 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: def send(self, value: bytes) -> None: """Send a message throuh the underlying communication channel + :param value: The value to send""" logger.debug( f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" ) - self._file_path.write_bytes(value) + with self._lock: + self._file_path.write_bytes(value) def recv(self) -> bytes: """Receieve a message through the underlying communication channel + :returns: the received message""" - ... + with self._lock: + if self._file_path.exists(): + incoming = self._file_path.read_bytes() + self._file_path.unlink() + return incoming + + @classmethod + def from_descriptor( + cls, + descriptor: t.Union[str, bytes], + ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" + try: + if isinstance(descriptor, str): + path = pathlib.Path(descriptor) + else: + path = pathlib.Path(descriptor.decode("utf-8")) + return FileSystemCommChannel(path) + except: + print("failed to create FS comm channel: {descriptor}") diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py index b1de280185..0582cae566 100644 --- a/tests/dragon/utils/worker.py +++ b/tests/dragon/utils/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,36 +94,11 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: - # transformed = [item.clone() for item in execute_result.predictions] - # return OutputTransformResult(transformed) - - # transformed = [item.bytes() for item in execute_result.predictions] - - # OutputTransformResult.transformed SHOULD be a list of - # capnproto Tensors Or tensor descriptors accompanying bytes - # send the original tensors... 
execute_result.predictions = [t.detach() for t in execute_result.predictions] # todo: solve sending all tensor metadata that coincisdes with each prediction return mliw.TransformOutputResult( execute_result.predictions, [1], "c", "float32" ) - # return OutputTransformResult(transformed) - - # @staticmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> t.Any: - # # results = IntegratedTorchWorker._prepare_outputs(results.outputs) - # # return results - # return None - # # response = MessageHandler.build_response( - # # status=200, # todo: are we satisfied with 0/1 (success, fail) - # # # todo: if not detailed messages, this shouldn't be returned. - # # message="success", - # # result=results, - # # custom_attributes=None, - # # ) - # # serialized_resp = MessageHandler.serialize_response(response) - # # return serialized_resp diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 4bc2014ea3..226e8683dd 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pathlib +import threading import typing as t from smartsim._core.mli.comm.channel.channel import CommChannelBase @@ -37,7 +38,10 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: - """Initialize the FileSystemCommChannel instance""" + """Initialize the FileSystemCommChannel instance + + :param key: a path to the root directory of the feature store""" + self._lock = threading.RLock() if not isinstance(key, bytes): super().__init__(key.as_posix().encode("utf-8")) self._file_path = key @@ -52,8 +56,36 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: def send(self, value: bytes) -> None: """Send a message throuh the underlying communication channel + :param value: The value to send""" logger.debug( f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" ) - self._file_path.write_bytes(value) + with self._lock: + self._file_path.write_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + + :returns: the received message""" + with self._lock: + if self._file_path.exists(): + incoming = self._file_path.read_bytes() + self._file_path.unlink() + return incoming + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" + try: + path = pathlib.Path(descriptor) + return FileSystemCommChannel(path) + except: + print(f"failed to create fs comm channel: {descriptor}") + raise diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index 93b3134318..de748ae6e5 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -29,6 +29,9 @@ import smartsim.error as sse from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) class MemoryFeatureStore(FeatureStore): @@ -40,6 +43,7 @@ def __init__(self) -> None: def __getitem__(self, key: str) -> bytes: """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" if key not in self._storage: raise sse.SmartSimError(f"{key} not found in feature store") @@ -47,8 
+51,9 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: str, value: bytes) -> None: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" self._storage[key] = value def __contains__(self, key: str) -> bool: @@ -57,18 +62,31 @@ def __contains__(self, key: str) -> bool: :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + return "in-memory-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. Stores all data on the file system""" - def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + def __init__( + self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None + ) -> None: """Initialize the FileSystemFeatureStore instance + :param storage_dir: (optional) root directory to store all data relative to""" + if isinstance(storage_dir, str): + storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir def __getitem__(self, key: str) -> bytes: """Retrieve an item using key + :param key: Unique key of an item to retrieve from the feature store""" path = self._key_path(key) if not path.exists(): @@ -77,6 +95,7 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: str, value: bytes) -> None: """Assign a value using key + :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" path = self._key_path(key, create=True) @@ -84,14 +103,16 @@ def __setitem__(self, key: str, value: bytes) -> None: def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. - Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" path = self._key_path(key) return path.exists() def _key_path(self, key: str, create: bool = False) -> pathlib.Path: """Given a key, return a path that is optionally combined with a base directory used by the FileSystemFeatureStore. 
+ :param key: Unique key of an item to retrieve from the feature store""" value = pathlib.Path(key) @@ -103,26 +124,32 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value - -class DragonDict: - """Mock implementation of a dragon dictionary""" - - def __init__(self) -> None: - """Initialize the mock DragonDict instance""" - self._storage: t.Dict[bytes, t.Any] = {} - - def __getitem__(self, key: bytes) -> t.Any: - """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" - return self._storage[key] - - def __setitem__(self, key: bytes, value: t.Any) -> None: - """Assign a value using key - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" - self._storage[key] = value - - def __contains__(self, key: bytes) -> bool: - """Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" - return key in self._storage + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" + try: + path = pathlib.Path(descriptor) + path.mkdir(parents=True, exist_ok=True) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index cff02c9c1c..6fa9f9944e 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -31,6 +31,7 @@ import torch import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -84,17 +85,18 @@ def persist_torch_tensor(test_dir: str) -> pathlib.Path: @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: +def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> None: """Verify that the ML worker successfully retrieves a model when given a valid (file system) key""" worker = MachineLearningWorkerCore key = str(persist_torch_model) - feature_store = FileSystemFeatureStore() + feature_store = FileSystemFeatureStore(test_dir) + fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -104,13 +106,14 @@ def 
test_fetch_model_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -127,10 +130,13 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: # put model bytes into the feature store feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) - fetch_result = worker.fetch_model(request, feature_store) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -140,17 +146,20 @@ def test_fetch_model_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "some-key" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(model_key=bad_key) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) # todo: consider that raising this exception shows impl. replace... with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -161,11 +170,14 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: key = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -176,13 +188,16 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" tensor_name = str(persist_torch_tensor) - request = InferenceRequest(input_keys=[tensor_name]) + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) worker = MachineLearningWorkerCore - feature_store = MemoryFeatureStore() feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -191,16 +206,17 @@ def test_fetch_input_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore - key = 
"/path/that/doesnt/exist" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[key]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert key in ex.value.args[0] + assert key[0] in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -211,13 +227,16 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[tensor_name]) + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs assert list(fetch_result.inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] @@ -230,6 +249,7 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # put model bytes into the feature store body1 = persist_torch_tensor.read_bytes() @@ -242,10 +262,14 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> feature_store[tensor_name + "3"] = body3 request = InferenceRequest( - input_keys=[tensor_name + "1", tensor_name + "2", tensor_name + "3"] + input_keys=[ + FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), + ] ) - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) raw_bytes = list(fetch_result.inputs) assert raw_bytes @@ -259,15 +283,16 @@ def test_fetch_input_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "bad-key" feature_store = MemoryFeatureStore() - request = InferenceRequest(input_keys=[bad_key]) + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -276,12 +301,13 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - model_name = "test-model" - feature_store[model_name] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[model_name]) + key = "test-model" + feature_store[key] = persist_torch_tensor.read_bytes() + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) - fetch_result = 
worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -304,18 +330,23 @@ def test_place_outputs() -> None: key_name = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # create a key to retrieve from the feature store - keys = [key_name + "1", key_name + "2", key_name + "3"] + keys = [ + FeatureStoreKey(key=key_name + "1", descriptor=fsd), + FeatureStoreKey(key=key_name + "2", descriptor=fsd), + FeatureStoreKey(key=key_name + "3", descriptor=fsd), + ] data = [b"abcdef", b"ghijkl", b"mnopqr"] - for k, v in zip(keys, data): - feature_store[k] = v + for fsk, v in zip(keys, data): + feature_store[fsk.key] = v request = InferenceRequest(output_keys=keys) transform_result = TransformOutputResult(data, [1], "c", "float32") - worker.place_output(request, transform_result, feature_store) + worker.place_output(request, transform_result, {fsd: feature_store}) for i in range(3): - assert feature_store[keys[i]] == data[i] + assert feature_store[keys[i].key] == data[i] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index b73e4a31b5..1e8bba7e33 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -26,12 +26,12 @@ import io -import numpy as np import pytest import torch from torch import nn from torch.nn import functional as F +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -102,7 +102,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key="model", + model_key=FeatureStoreKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/mli/worker.py b/tests/mli/worker.py index b1de280185..0582cae566 100644 --- a/tests/mli/worker.py +++ b/tests/mli/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,36 +94,11 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: - # transformed = [item.clone() for item in execute_result.predictions] - # return OutputTransformResult(transformed) - - # transformed = [item.bytes() for item in execute_result.predictions] - - # OutputTransformResult.transformed SHOULD be a list of - # capnproto Tensors Or tensor descriptors accompanying bytes - # send the original tensors... 
execute_result.predictions = [t.detach() for t in execute_result.predictions] # todo: solve sending all tensor metadata that coincisdes with each prediction return mliw.TransformOutputResult( execute_result.predictions, [1], "c", "float32" ) - # return OutputTransformResult(transformed) - - # @staticmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> t.Any: - # # results = IntegratedTorchWorker._prepare_outputs(results.outputs) - # # return results - # return None - # # response = MessageHandler.build_response( - # # status=200, # todo: are we satisfied with 0/1 (success, fail) - # # # todo: if not detailed messages, this shouldn't be returned. - # # message="success", - # # result=results, - # # custom_attributes=None, - # # ) - # # serialized_resp = MessageHandler.serialize_response(response) - # # return serialized_resp diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py index 1d8d069fab..c94ae375b4 100644 --- a/tests/test_dragon_run_policy.py +++ b/tests/test_dragon_run_policy.py @@ -143,7 +143,6 @@ def test_create_run_policy_run_request_no_run_policy() -> None: assert policy.device == Policy.Device.DEFAULT assert set(policy.cpu_affinity) == set() assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -167,7 +166,6 @@ def test_create_run_policy_run_request_default_run_policy() -> None: assert set(policy.cpu_affinity) == set() assert set(policy.gpu_affinity) == set() - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -192,7 +190,6 @@ def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -216,7 +213,6 @@ def test_create_run_policy_run_request_cpu_affinity() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -240,7 +236,6 @@ def test_create_run_policy_run_request_gpu_affinity() -> None: assert policy.cpu_affinity == [] assert set(policy.gpu_affinity) == set(affinity) - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index 135e967983..c09c787fcf 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -35,10 +35,13 @@ def test_build_model_key_successful(): - model_key = handler.build_model_key("tensor_key") + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" + assert model_key.featureStoreDescriptor == fsd def test_build_model_key_unsuccessful(): with pytest.raises(ValueError): - model_key = handler.build_model_key(100) + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key(100, fsd) diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py index 7abe9e853d..6a28b80c4f 100644 
--- a/tests/test_message_handler/test_build_tensor_key.py +++ b/tests/test_message_handler/test_build_tensor_key.py @@ -35,10 +35,12 @@ def test_build_tensor_key_successful(): - tensor_key = handler.build_tensor_key("tensor_key") + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key("tensor_key", fsd) assert tensor_key.key == "tensor_key" def test_build_tensor_key_unsuccessful(): with pytest.raises(ValueError): - tensor_key = handler.build_tensor_key(100) + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index fd21eeb0d5..beb9a47657 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -33,7 +33,8 @@ handler = MessageHandler() -tensor_key = handler.build_tensor_key("key") +fsd = "mock-feature-store-descriptor" +tensor_key = handler.build_tensor_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 4cfc115845..ea9b04d649 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -31,14 +31,16 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -model_key = MessageHandler.build_model_key("model_key") +fsd = "mock-feature-store-descriptor" + +model_key = MessageHandler.build_model_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_tensor_key("input_key1") -input_key2 = MessageHandler.build_tensor_key("input_key2") +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) -output_key1 = MessageHandler.build_tensor_key("output_key1") -output_key2 = MessageHandler.build_tensor_key("output_key2") +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 03bd9ba73f..d6894eb5cc 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -31,9 +31,10 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_tensor_key("result_key1") -result_key2 = MessageHandler.build_tensor_key("result_key2") +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From 74d6e78c99a4bb528c7ac8ee725d45083595d5a8 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:05:23 -0700 Subject: [PATCH 15/60] Use `torch.from_numpy` instead of `torch.tensor` to reduce a copy (#661) Reduce copies by using `torch.from_numpy`. 
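For reference, a minimal standalone sketch of the behavior this change relies on (not part of the patch itself): `torch.tensor` always allocates new storage, while `torch.from_numpy` wraps the existing NumPy buffer without copying.

```python
import numpy as np
import torch

# frombuffer returns a read-only view; copy() makes it writable so it can
# be wrapped without the "non-writable array" warning.
arr = np.frombuffer(bytes(8), dtype=np.float32).copy()

copied = torch.tensor(arr)      # new storage: one extra copy of the data
shared = torch.from_numpy(arr)  # zero-copy: tensor and array share memory

arr[0] = 42.0
assert shared[0].item() == 42.0  # change is visible, memory is shared
assert copied[0].item() == 0.0   # owns its own copy, unaffected
```

The trade-off of the zero-copy path is aliasing: writes to the source array remain visible through the tensor.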
--- doc/changelog.md | 1 + ex/high_throughput_inference/mock_app.py | 8 +++----- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index fcd5ec215e..80dd23cf4e 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Reduce a copy by using torch.from_numpy instead of torch.tensor - Enable dynamic feature store selection - Fix dragon package installation bug - Adjust schemas for better performance diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 3a5169a668..7221ee36f1 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -157,11 +157,9 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? data_blob = from_recvh.recv_bytes(timeout=None) - result = torch.from_numpy( - numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) + result = numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), ) self.measure_time("deserialize_tensor") diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index e732ecd2cd..eea349894c 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -80,7 +80,7 @@ def transform_input( for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): tensor_desc: tensor_capnp.TensorDescriptor = item_meta result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + torch.from_numpy(np.frombuffer(item, dtype=str(tensor_desc.dataType))) .to(device) .reshape(tuple(dim for dim in tensor_desc.dimensions)) ) From 391784c95607fb7c0a4b60d72c01cc1dd0968ff7 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:27:20 -0700 Subject: [PATCH 16/60] MLI environment variables updated using new naming convention (#665) `SS_INFRA_BACKBONE` has been updated to `_SMARTSIM_INFRA_BACKBONE` and `SS_REQUEST_QUEUE` is now `_SMARTSIM_REQUEST_QUEUE`. 
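As a standalone illustration of the rename (the descriptor strings below are placeholders, not real serialized Dragon objects), producers now export and consumers now read the underscore-prefixed names:

```python
import os

# Placeholders standing in for base64-encoded descriptors.
os.environ["_SMARTSIM_INFRA_BACKBONE"] = "<backbone-descriptor>"
os.environ["_SMARTSIM_REQUEST_QUEUE"] = "<request-queue-descriptor>"

# The previous SS_-prefixed names are no longer consulted, so anything
# still setting SS_INFRA_BACKBONE / SS_REQUEST_QUEUE will be ignored.
backbone = os.getenv("_SMARTSIM_INFRA_BACKBONE", "")
request_queue = os.getenv("_SMARTSIM_REQUEST_QUEUE", "")
assert backbone and request_queue
```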
[ committed by @AlyssaCote ] [ reviewed by @mellis13 ] --- doc/changelog.md | 1 + ex/high_throughput_inference/mock_app.py | 2 +- .../standalone_workermanager.py | 4 ++-- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- .../_core/mli/infrastructure/environmentloader.py | 8 ++++---- tests/dragon/test_environment_loader.py | 12 ++++++++---- tests/dragon/test_error_handling.py | 12 ++++++++---- tests/dragon/test_worker_manager.py | 2 +- 8 files changed, 26 insertions(+), 17 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 80dd23cf4e..18a0fed70f 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Update MLI environment variables using new naming convention - Reduce a copy by using torch.from_numpy instead of torch.tensor - Enable dynamic feature store selection - Fix dragon package installation bug diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 7221ee36f1..44db70b71d 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -56,7 +56,7 @@ class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() - ddict_str = os.environ["SS_INFRA_BACKBONE"] + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 2b5ba7df42..982cb6cc38 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -69,7 +69,7 @@ args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["SS_INFRA_BACKBONE"] + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() @@ -81,7 +81,7 @@ torch_worker = cloudpickle.loads(worker_type_name)() descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["SS_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 4fe6d55ad6..daf18e2cb9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -515,7 +515,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "SS_INFRA_BACKBONE": self.infra_ddict, + "_SMARTSIM_INFRA_BACKBONE": self.infra_ddict, }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index b4b9e565ce..99202ef2ea 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -72,8 +72,8 @@ def get_backbone(self) -> t.Optional[FeatureStore]: an environment variable. 
The backbone is a standalone, system-created feature store used to share internal information among MLI components - :returns: The attached feature store via SS_INFRA_BACKBONE""" - descriptor = os.getenv("SS_INFRA_BACKBONE", "") + :returns: The attached feature store via _SMARTSIM_INFRA_BACKBONE""" + descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") if not descriptor: logger.warning("No backbone descriptor is configured") @@ -90,8 +90,8 @@ def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor found in an environment variable. - :returns: The attached queue specified via `SS_REQUEST_QUEUE`""" - descriptor = os.getenv("SS_REQUEST_QUEUE", "") + :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE`""" + descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") if not descriptor: logger.warning("No queue descriptor is configured") diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 6ae5d2b301..8f2716488b 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -55,7 +55,9 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke """A descriptor can be stored, loaded, and reattached""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -76,7 +78,9 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): queue are the same""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -89,7 +93,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" - monkeypatch.setenv("SS_REQUEST_QUEUE", "randomstring") + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=None, @@ -104,7 +108,7 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): """Verify the dragon feature store is loaded correctly by the EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" feature_store = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_INFRA_BACKBONE", feature_store.descriptor) + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 208ab1e5e9..5603269b2f 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -89,9 +89,11 @@ def setup_worker_manager_model_bytes( chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", 
du.B64.bytes_to_str(queue.serialize()) + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( @@ -127,9 +129,11 @@ def setup_worker_manager_model_key( chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) - monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 864e14993c..c8332c260f 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -167,7 +167,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["SS_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, From f7ef49b798b6197d7172539339d884bec7664250 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:23:47 -0700 Subject: [PATCH 17/60] Remove pydantic dependency from MLI code (#667) Converted `FeatureStoreKey` into a frozen dataclass and used `_post_init_` to validate that the key and descriptor are not empty strings. 
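A standalone sketch of the pattern (a simplified stand-in, not the exact class added by the patch): `__post_init__` runs after the generated `__init__` assigns the fields, so it can validate them even on a frozen dataclass, where ordinary attribute assignment is blocked.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class ValidatedKey:
    """Simplified stand-in for a key/descriptor pair with validation."""

    key: str
    descriptor: str

    def __post_init__(self) -> None:
        # Replaces pydantic-style min_length=1 field constraints.
        if len(self.key) < 1:
            raise ValueError("Key must have at least one character.")
        if len(self.descriptor) < 1:
            raise ValueError("Descriptor must have at least one character.")


ValidatedKey(key="model-key", descriptor="fs-descriptor")  # constructs fine

try:
    ValidatedKey(key="", descriptor="fs-descriptor")
except ValueError as exc:
    print(exc)  # "Key must have at least one character."
```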
[ committed by @AlyssaCote ] [ approved by @ankona ] --- doc/changelog.md | 1 + .../infrastructure/storage/featurestore.py | 20 ++++++++++++++----- .../mli/test_core_machine_learning_worker.py | 12 +++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 18a0fed70f..9240efbc8a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Remove pydantic dependency from MLI code - Update MLI environment variables using new naming convention - Reduce a copy by using torch.from_numpy instead of torch.tensor - Enable dynamic feature store selection diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index d511d588e1..31e3866e70 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -26,22 +26,32 @@ import typing as t from abc import ABC, abstractmethod - -from pydantic import BaseModel, Field +from dataclasses import dataclass from smartsim.log import get_logger logger = get_logger(__name__) -class FeatureStoreKey(BaseModel): +@dataclass(frozen=True) +class FeatureStoreKey: """A key,descriptor pair enabling retrieval of an item from a feature store""" - key: str = Field(min_length=1) + key: str """The unique key of an item in a feature store""" - descriptor: str = Field(min_length=1) + descriptor: str """The unique identifier of the feature store containing the key""" + def __post_init__(self) -> None: + """Ensure the key and descriptor have at least one character + + :raises ValueError: if key or descriptor are empty strings + """ + if len(self.key) < 1: + raise ValueError("Key must have at least one character.") + if len(self.descriptor) < 1: + raise ValueError("Descriptor must have at least one character.") + class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index 6fa9f9944e..7ef4ab259b 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -350,3 +350,15 @@ def test_place_outputs() -> None: for i in range(3): assert feature_store[keys[i].key] == data[i] + + +@pytest.mark.parametrize( + "key, descriptor", + [ + pytest.param("", "desc", id="invalid key"), + pytest.param("key", "", id="invalid descriptor"), + ], +) +def test_invalid_featurestorekey(key, descriptor) -> None: + with pytest.raises(ValueError): + fsk = FeatureStoreKey(key, descriptor) From ef034d569ef736e4ab524dc140d002057228cca1 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Sun, 25 Aug 2024 23:47:01 -0400 Subject: [PATCH 18/60] Enable specification of target hostname for a dragon task (#660) ## Description This PR adds two features: 1. Ability to specify hostnames that tasks should run on 2. Enable tasks colocation ### Specifying Hostnames The existing `DragonRunRequest` supported the ability to specify a hostname when creating a policy used to run a task. However, the hostnames were not exposed to clients. This ticket allows clients to pass a list of hosts that will be used in place of the default "first available host" behavior. ### Task Colocation The prior system for finding nodes to execute a task worked worked only with unassigned nodes. Any node assigned a task could not be assigned another task. 
This ticket adds a more capable prioritizer class that enables clients using hostnames to colocate tasks. It also retains the capability to return open nodes when no hostname is specified. --- doc/changelog.md | 1 + .../_core/launcher/dragon/dragonBackend.py | 225 +++++-- .../_core/launcher/dragon/dragonLauncher.py | 2 + smartsim/_core/launcher/dragon/pqueue.py | 467 +++++++++++++++ smartsim/_core/launcher/step/dragonStep.py | 2 + smartsim/settings/dragonRunSettings.py | 20 + tests/test_dragon_run_request.py | 341 ++++++----- tests/test_dragon_runsettings.py | 119 ++++ tests/test_dragon_step.py | 13 + tests/test_node_prioritizer.py | 555 ++++++++++++++++++ 10 files changed, 1542 insertions(+), 203 deletions(-) create mode 100644 smartsim/_core/launcher/dragon/pqueue.py create mode 100644 tests/test_node_prioritizer.py diff --git a/doc/changelog.md b/doc/changelog.md index 9240efbc8a..964e62b49d 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Enable hostname selection for dragon tasks - Remove pydantic dependency from MLI code - Update MLI environment variables using new naming convention - Reduce a copy by using torch.from_numpy instead of torch.tensor diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index daf18e2cb9..2fda876462 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -45,6 +45,8 @@ import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter + # pylint: enable=import-error # isort: on from ...._core.config import get_config @@ -190,6 +192,18 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) self._infra_ddict: t.Optional[dragon_ddict.DDict] = None + self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) + + self._nodes: t.List["dragon_machine.Node"] = [] + """Node capability information for hosts in the allocation""" + self._hosts: t.List[str] = [] + """List of hosts available in allocation""" + self._cpus: t.List[int] = [] + """List of cpu-count by node""" + self._gpus: t.List[int] = [] + """List of gpu-count by node""" + self._allocated_hosts: t.Dict[str, t.Set[str]] = {} + """Mapping with hostnames as keys and a set of running step IDs as the value""" @property def hosts(self) -> list[str]: @@ -197,34 +211,39 @@ def hosts(self) -> list[str]: return self._hosts @property - def allocated_hosts(self) -> dict[str, str]: + def allocated_hosts(self) -> dict[str, t.Set[str]]: + """A map of host names to the step id executing on a host + + :returns: Dictionary with host name as key and step id as value""" with self._queue_lock: return self._allocated_hosts @property - def free_hosts(self) -> t.Deque[str]: + def free_hosts(self) -> t.Sequence[str]: + """Find hosts that do not have a step assigned + + :returns: List of host names""" with self._queue_lock: - return self._free_hosts + return list(map(lambda x: x.hostname, self._prioritizer.unassigned())) @property def group_infos(self) -> dict[str, ProcessGroupInfo]: + """Find information pertaining to process groups executing on a host + + :returns: Dictionary with host name as key and group information as value""" with self._queue_lock: return self._group_infos def _initialize_hosts(self) -> None: + """Prepare metadata about the allocation""" with 
self._queue_lock: self._nodes = [ dragon_machine.Node(node) for node in dragon_machine.System().nodes ] - self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) + self._hosts = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] - - """List of hosts available in allocation""" - self._free_hosts: t.Deque[str] = collections.deque(self._hosts) - """List of hosts on which steps can be launched""" - self._allocated_hosts: t.Dict[str, str] = {} - """Mapping of hosts on which a step is already running to step ID""" + self._allocated_hosts = collections.defaultdict(set) def __str__(self) -> str: return self.status_message @@ -233,7 +252,7 @@ def __str__(self) -> str: def status_message(self) -> str: """Message with status of available nodes and history of launched jobs. - :returns: Status message + :returns: a status message """ return ( "Dragon server backend update\n" @@ -245,9 +264,8 @@ def _heartbeat(self) -> None: @property def cooldown_period(self) -> int: - """Time (in seconds) the server will wait before shutting down - - when exit conditions are met (see ``should_shutdown()`` for further details). + """Time (in seconds) the server will wait before shutting down when + exit conditions are met (see ``should_shutdown()`` for further details). """ return self._cooldown_period @@ -281,6 +299,8 @@ def should_shutdown(self) -> bool: and it requested immediate shutdown, or if it did not request immediate shutdown, but all jobs have been executed. In both cases, a cooldown period may need to be waited before shutdown. + + :returns: `True` if the server should terminate, otherwise `False` """ if self._shutdown_requested and self._can_shutdown: return self._has_cooled_down @@ -288,7 +308,9 @@ def should_shutdown(self) -> bool: @property def current_time(self) -> float: - """Current time for DragonBackend object, in seconds since the Epoch""" + """Current time for DragonBackend object, in seconds since the Epoch + + :returns: the current timestamp""" return time.time() def _can_honor_policy( @@ -296,63 +318,149 @@ def _can_honor_policy( ) -> t.Tuple[bool, t.Optional[str]]: """Check if the policy can be honored with resources available in the allocation. 
- :param request: DragonRunRequest containing policy information + + :param request: `DragonRunRequest` to validate :returns: Tuple indicating if the policy can be honored and an optional error message""" # ensure the policy can be honored if request.policy: + logger.debug(f"{request.policy=}{self._cpus=}{self._gpus=}") + if request.policy.cpu_affinity: # make sure some node has enough CPUs - available = max(self._cpus) + last_available = max(self._cpus or [-1]) requested = max(request.policy.cpu_affinity) - - if requested >= available: + if not any(self._cpus) or requested >= last_available: return False, "Cannot satisfy request, not enough CPUs available" - if request.policy.gpu_affinity: # make sure some node has enough GPUs - available = max(self._gpus) + last_available = max(self._gpus or [-1]) requested = max(request.policy.gpu_affinity) - - if requested >= available: + if not any(self._gpus) or requested >= last_available: + logger.warning( + f"failed check w/{self._gpus=}, {requested=}, {last_available=}" + ) return False, "Cannot satisfy request, not enough GPUs available" - return True, None def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: - """Check if request can be honored with resources available in the allocation. - - Currently only checks for total number of nodes, - in the future it will also look at other constraints - such as memory, accelerators, and so on. + """Check if request can be honored with resources available in + the allocation. Currently only checks for total number of nodes, + in the future it will also look at other constraints such as memory, + accelerators, and so on. + + :param request: `DragonRunRequest` to validate + :returns: Tuple indicating if the request can be honored and + an optional error message """ - if request.nodes > len(self._hosts): - message = f"Cannot satisfy request. Requested {request.nodes} nodes, " - message += f"but only {len(self._hosts)} nodes are available." - return False, message - if self._shutdown_requested: - message = "Cannot satisfy request, server is shutting down." - return False, message + honorable, err = self._can_honor_state(request) + if not honorable: + return False, err honorable, err = self._can_honor_policy(request) if not honorable: return False, err + honorable, err = self._can_honor_hosts(request) + if not honorable: + return False, err + + return True, None + + def _can_honor_hosts( + self, request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the current state of the backend process inhibits executing + the request. + + :param request: `DragonRunRequest` to validate + :returns: Tuple indicating if the request can be honored and + an optional error message""" + all_hosts = frozenset(self._hosts) + num_nodes = request.nodes + + # fail if requesting more nodes than the total number available + if num_nodes > len(all_hosts): + message = f"Cannot satisfy request. {num_nodes} requested nodes" + message += f" exceeds {len(all_hosts)} available." 
+ return False, message + + requested_hosts = all_hosts + if request.hostlist: + requested_hosts = frozenset( + {host.strip() for host in request.hostlist.split(",")} + ) + + valid_hosts = all_hosts.intersection(requested_hosts) + invalid_hosts = requested_hosts - valid_hosts + + logger.debug(f"{num_nodes=}{valid_hosts=}{invalid_hosts=}") + + if invalid_hosts: + logger.warning(f"Some invalid hostnames were requested: {invalid_hosts}") + + # fail if requesting specific hostnames and there aren't enough available + if num_nodes > len(valid_hosts): + message = f"Cannot satisfy request. Requested {num_nodes} nodes, " + message += f"but only {len(valid_hosts)} named hosts are available." + return False, message + + return True, None + + def _can_honor_state( + self, _request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the current state of the backend process inhibits executing + the request. + :param _request: the DragonRunRequest to verify + :returns: Tuple indicating if the request can be honored and + an optional error message""" + if self._shutdown_requested: + message = "Cannot satisfy request, server is shutting down." + return False, message + return True, None def _allocate_step( self, step_id: str, request: DragonRunRequest ) -> t.Optional[t.List[str]]: + """Identify the hosts on which the request will be executed + :param step_id: The identifier of a step that will be executed on the host + :param request: The request to be executed + :returns: A list of selected hostnames""" + # ensure at least one host is selected num_hosts: int = request.nodes + with self._queue_lock: - if num_hosts <= 0 or num_hosts > len(self._free_hosts): + if num_hosts <= 0 or num_hosts > len(self._hosts): + logger.debug( + f"The number of requested hosts ({num_hosts}) is invalid or" + f" cannot be satisfied with {len(self._hosts)} available nodes" + ) return None - to_allocate = [] - for _ in range(num_hosts): - host = self._free_hosts.popleft() - self._allocated_hosts[host] = step_id - to_allocate.append(host) + + hosts = [] + if request.hostlist: + # convert the comma-separated argument into a real list + hosts = [host for host in request.hostlist.split(",") if host] + + filter_on: t.Optional[PrioritizerFilter] = None + if request.policy and request.policy.gpu_affinity: + filter_on = PrioritizerFilter.GPU + + nodes = self._prioritizer.next_n(num_hosts, filter_on, step_id, hosts) + + if len(nodes) < num_hosts: + # exit if the prioritizer can't identify enough nodes + return None + + to_allocate = [node.hostname for node in nodes] + + for hostname in to_allocate: + # track assigning this step to each node + self._allocated_hosts[hostname].add(step_id) + return to_allocate @staticmethod @@ -392,6 +500,7 @@ def _create_redirect_workers( return grp_redir def _stop_steps(self) -> None: + """Trigger termination of all currently executing steps""" self._heartbeat() with self._queue_lock: while len(self._stop_requests) > 0: @@ -451,6 +560,7 @@ def create_run_policy( request: DragonRequest, node_name: str ) -> "dragon_policy.Policy": """Create a dragon Policy from the request and node name + :param request: DragonRunRequest containing policy information :param node_name: Name of the node on which the process will run :returns: dragon_policy.Policy object mapped from request properties""" @@ -586,9 +696,11 @@ def _start_steps(self) -> None: logger.error(e) def _refresh_statuses(self) -> None: + """Query underlying management system for step status and update + stored assigned and unassigned 
task information""" self._heartbeat() with self._queue_lock: - terminated = [] + terminated: t.Set[str] = set() for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group @@ -622,11 +734,15 @@ def _refresh_statuses(self) -> None: ) if group_info.status in TERMINAL_STATUSES: - terminated.append(step_id) + terminated.add(step_id) if terminated: logger.debug(f"{terminated=}") + # remove all the terminated steps from all hosts + for host in list(self._allocated_hosts.keys()): + self._allocated_hosts[host].difference_update(terminated) + for step_id in terminated: self._running_steps.remove(step_id) self._completed_steps.append(step_id) @@ -634,11 +750,13 @@ def _refresh_statuses(self) -> None: if group_info is not None: for host in group_info.hosts: logger.debug(f"Releasing host {host}") - try: - self._allocated_hosts.pop(host) - except KeyError: + if host not in self._allocated_hosts: logger.error(f"Tried to free a non-allocated host: {host}") - self._free_hosts.append(host) + else: + # remove any hosts that have had all their steps terminated + if not self._allocated_hosts[host]: + self._allocated_hosts.pop(host) + self._prioritizer.decrement(host, step_id) group_info.process_group = None group_info.redir_workers = None @@ -662,6 +780,7 @@ def _should_print_status(self) -> bool: return False def _update(self) -> None: + """Trigger all update queries and update local state database""" self._stop_steps() self._start_steps() self._refresh_statuses() @@ -749,8 +868,12 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: class DragonBackendView: - def __init__(self, backend: DragonBackend): + def __init__(self, backend: DragonBackend) -> None: + """Initialize the instance + + :param backend: A dragon backend used to produce the view""" self._backend = backend + """A dragon backend used to produce the view""" @property def host_desc(self) -> str: @@ -812,9 +935,7 @@ def step_table(self) -> str: @property def host_table(self) -> str: """Table representation of current state of nodes available - - in the allocation. - """ + in the allocation.""" headers = ["Host", "Status"] hosts = self._backend.hosts free_hosts = self._backend.free_hosts diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 9078fed54f..e8391410bb 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -170,6 +170,7 @@ def run(self, step: Step) -> t.Optional[str]: merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + hosts = str(run_args.get("host-list", "")) policy = DragonRunPolicy.from_run_args(run_args) @@ -187,6 +188,7 @@ def run(self, step: Step) -> t.Optional[str]: output_file=out, error_file=err, policy=policy, + hostlist=hosts, ) ), DragonRunResponse, diff --git a/smartsim/_core/launcher/dragon/pqueue.py b/smartsim/_core/launcher/dragon/pqueue.py new file mode 100644 index 0000000000..a9faf76b1e --- /dev/null +++ b/smartsim/_core/launcher/dragon/pqueue.py @@ -0,0 +1,467 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import collections +import enum +import heapq +import threading +import typing as t + +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class Node(t.Protocol): + """Base Node API required to support the NodePrioritizer""" + + @property + def hostname(self) -> str: + """The hostname of the node""" + + @property + def num_cpus(self) -> int: + """The number of CPUs in the node""" + + @property + def num_gpus(self) -> int: + """The number of GPUs in the node""" + + +class NodeReferenceCount(t.Protocol): + """Contains details pertaining to references to a node""" + + @property + def hostname(self) -> str: + """The hostname of the node""" + + @property + def num_refs(self) -> int: + """The number of jobs assigned to the node""" + + +class _TrackedNode: + """Node wrapper that tracks the reference counting details required by the NodePrioritizer""" + + def __init__(self, node: Node) -> None: + self._node = node + """The node being tracked""" + self._num_refs = 0 + """The number of references to the tracked node""" + self._assigned_tasks: t.Set[str] = set() + """The unique identifiers of processes using this node""" + self._is_dirty = False + """Flag indicating that tracking information has been modified""" + + @property + def hostname(self) -> str: + """Returns the hostname of the node""" + return self._node.hostname + + @property + def num_cpus(self) -> int: + """Returns the number of CPUs in the node""" + return self._node.num_cpus + + @property + def num_gpus(self) -> int: + """Returns the number of GPUs attached to the node""" + return self._node.num_gpus + + @property + def num_refs(self) -> int: + """Returns the number of processes currently running on the node""" + return self._num_refs + + @property + def is_assigned(self) -> bool: + """Returns `True` if one or more references are currently counted, `False` otherwise""" + return self._num_refs > 0 + + @property + def assigned_tasks(self) -> t.Set[str]: + """Returns the set of unique IDs for currently running processes""" + return self._assigned_tasks + + @property + def is_dirty(self) -> bool: + """Returns a flag indicating if the reference counter has changed.
`True` + if references have been added or removed, `False` otherwise.""" + return self._is_dirty + + def clean(self) -> None: + """Marks the node as unmodified""" + self._is_dirty = False + + def add( + self, + tracking_id: t.Optional[str] = None, + ) -> None: + """Update the node to indicate the addition of a process that must be + reference counted. + + :param tracking_id: a unique task identifier executing on the node + to add + :raises ValueError: if tracking_id is already assigned to this node""" + if tracking_id in self.assigned_tasks: + raise ValueError("Attempted adding task more than once") + + self._num_refs = self._num_refs + 1 + if tracking_id: + self._assigned_tasks = self._assigned_tasks.union({tracking_id}) + self._is_dirty = True + + def remove( + self, + tracking_id: t.Optional[str] = None, + ) -> None: + """Update the reference counter to indicate the removal of a process. + + :param tracking_id: a unique task identifier executing on the node + to remove + :raises ValueError: if tracking_id is not assigned to this node""" + if tracking_id and tracking_id not in self.assigned_tasks: + raise ValueError("Attempted removal of untracked item") + + self._num_refs = max(self._num_refs - 1, 0) + if tracking_id: + self._assigned_tasks = self._assigned_tasks - {tracking_id} + self._is_dirty = True + + def __lt__(self, other: "_TrackedNode") -> bool: + """Comparison operator used to evaluate the ordering of nodes within + the prioritizer. This comparison only considers reference counts. + + :param other: Another node to compare against + :returns: True if this node has fewer references than the other node""" + if self.num_refs < other.num_refs: + return True + + return False + + +class PrioritizerFilter(str, enum.Enum): + """A filter used to select a subset of nodes to be queried""" + + CPU = enum.auto() + GPU = enum.auto() + + +class NodePrioritizer: + def __init__(self, nodes: t.List[Node], lock: threading.RLock) -> None: + """Initialize the prioritizer + + :param nodes: node attribute information for initializing the prioritizer + :param lock: a lock used to ensure threadsafe operations + :raises SmartSimError: if the nodes collection is empty + """ + if not nodes: + raise SmartSimError("Missing nodes to prioritize") + + self._lock = lock + """Lock used to ensure thread safe changes of the reference counters""" + self._cpu_refs: t.List[_TrackedNode] = [] + """Track reference counts to CPU-only nodes""" + self._gpu_refs: t.List[_TrackedNode] = [] + """Track reference counts to GPU nodes""" + self._nodes: t.Dict[str, _TrackedNode] = {} + + self._initialize_reference_counters(nodes) + + def _initialize_reference_counters(self, nodes: t.List[Node]) -> None: + """Perform initialization of reference counters for nodes in the allocation + + :param nodes: node attribute information for initializing the prioritizer""" + for node in nodes: + # create a set of reference counters for the nodes + tracked = _TrackedNode(node) + + self._nodes[node.hostname] = tracked # for O(1) access + + if node.num_gpus: + self._gpu_refs.append(tracked) + else: + self._cpu_refs.append(tracked) + + def increment( + self, host: str, tracking_id: t.Optional[str] = None + ) -> NodeReferenceCount: + """Directly increment the reference count of a given node and ensure the + ref counter is marked as dirty to trigger a reordering on retrieval + + :param host: a hostname that should have a reference counter selected + :param tracking_id: a unique task identifier executing on the node + to add""" + with self._lock: +
tracked_node = self._nodes[host] + tracked_node.add(tracking_id) + return tracked_node + + def _heapify_all_refs(self) -> t.List[_TrackedNode]: + """Combine the CPU and GPU nodes into a single heap + + :returns: list of all reference counters""" + refs = [*self._cpu_refs, *self._gpu_refs] + heapq.heapify(refs) + return refs + + def get_tracking_info(self, host: str) -> NodeReferenceCount: + """Returns the reference counter information for a single node + + :param host: a hostname that should have a reference counter selected + :returns: a reference counter for the node + :raises ValueError: if the hostname is not in the set of managed nodes""" + if host not in self._nodes: + raise ValueError("The supplied hostname was not found") + + return self._nodes[host] + + def decrement( + self, host: str, tracking_id: t.Optional[str] = None + ) -> NodeReferenceCount: + """Directly decrement the reference count of a given node and ensure the + ref counter is marked as dirty to trigger a reordering + + :param host: a hostname that should have a reference counter decremented + :param tracking_id: unique task identifier to remove""" + with self._lock: + tracked_node = self._nodes[host] + tracked_node.remove(tracking_id) + + return tracked_node + + def _create_sub_heap( + self, + hosts: t.Optional[t.List[str]] = None, + filter_on: t.Optional[PrioritizerFilter] = None, + ) -> t.List[_TrackedNode]: + """Create a new heap from the primary heap with user-specified nodes + + :param hosts: a list of hostnames used to filter the available nodes + :returns: a list of assigned reference counters + """ + nodes_tracking_info: t.List[_TrackedNode] = [] + heap = self._get_filtered_heap(filter_on) + + # Collect all the tracking info for the requested nodes... + for node in heap: + if not hosts or node.hostname in hosts: + nodes_tracking_info.append(node) + + # ... 
and use it to create a new heap from a specified subset of nodes + heapq.heapify(nodes_tracking_info) + + return nodes_tracking_info + + def unassigned( + self, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> t.Sequence[Node]: + """Select nodes that are currently not assigned a task + + :param heap: a subset of the node heap to consider + :returns: a list of reference counts for all unassigned nodes""" + if heap is None: + heap = list(self._nodes.values()) + + nodes: t.List[_TrackedNode] = [] + for item in heap: + if item.num_refs == 0: + nodes.append(item) + return nodes + + def assigned( + self, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> t.Sequence[Node]: + """Helper method to identify the nodes that are currently assigned + + :param heap: a subset of the node heap to consider + :returns: a list of reference counts for all assigned nodes""" + if heap is None: + heap = list(self._nodes.values()) + + nodes: t.List[_TrackedNode] = [] + for item in heap: + if item.num_refs > 0: + nodes.append(item) + return nodes + + def _check_satisfiable_n( + self, num_items: int, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> bool: + """Validates that a request for some number of nodes `n` can be + satisfied by the prioritizer given the set of nodes available + + :param num_items: the desired number of nodes to allocate + :param heap: a subset of the node heap to consider + :returns: True if the request can be fulfilled, False otherwise""" + num_nodes = len(self._nodes.keys()) + + if num_items < 1: + msg = "Cannot handle request; request requires a positive integer" + logger.warning(msg) + return False + + if num_nodes < num_items: + msg = f"Cannot satisfy request for {num_items} nodes; {num_nodes} in pool" + logger.warning(msg) + return False + + num_open = len(self.unassigned(heap)) + if num_open < num_items: + msg = f"Cannot satisfy request for {num_items} nodes; {num_open} available" + logger.warning(msg) + return False + + return True + + def _get_next_unassigned_node( + self, + heap: t.List[_TrackedNode], + tracking_id: t.Optional[str] = None, + ) -> t.Optional[Node]: + """Finds the next node with no running processes and + ensures that any elements that were directly updated are updated in + the priority structure before being made available + + :param heap: a subset of the node heap to consider + :param tracking_id: unique task identifier to track + :returns: a reference counter for an available node if an unassigned node + exists, `None` otherwise""" + tracking_info: t.Optional[_TrackedNode] = None + + with self._lock: + # re-sort the heap to handle any tracking changes + if any(node.is_dirty for node in heap): + heapq.heapify(heap) + + # grab the min node from the heap + tracking_info = heapq.heappop(heap) + + # the node is available if it has no assigned tasks + is_assigned = tracking_info.is_assigned + if not is_assigned: + # track the new process on the node + tracking_info.add(tracking_id) + + # add the node that was popped back into the heap + heapq.heappush(heap, tracking_info) + + # mark all nodes as clean now that everything is updated & sorted + for node in heap: + node.clean() + + # next available must only return previously unassigned nodes + if is_assigned: + return None + + return tracking_info + + def _get_next_n_available_nodes( + self, + num_items: int, + heap: t.List[_TrackedNode], + tracking_id: t.Optional[str] = None, + ) -> t.List[Node]: + """Find the next N available nodes w/least amount of references using + the supplied filter to target a specific node 
capability + + :param num_items: number of nodes to reserve + :param heap: a subset of the node heap to consider + :param tracking_id: unique task identifier to track + :returns: a list of reference counters for available nodes if enough + unassigned nodes exist, or an empty list otherwise + :raises ValueError: if the number of requested nodes is not a positive integer + """ + next_nodes: t.List[Node] = [] + + if num_items < 1: + raise ValueError(f"Number of items requested {num_items} is invalid") + + if not self._check_satisfiable_n(num_items, heap): + return next_nodes + + while len(next_nodes) < num_items: + if next_node := self._get_next_unassigned_node(heap, tracking_id): + next_nodes.append(next_node) + continue + break + + return next_nodes + + def _get_filtered_heap( + self, filter_on: t.Optional[PrioritizerFilter] = None + ) -> t.List[_TrackedNode]: + """Helper method to select the set of nodes to include in a filtered + heap. + + :param filter_on: the filter that determines which nodes are included. If no + filter is supplied, all nodes are returned""" + if filter_on == PrioritizerFilter.GPU: + return self._gpu_refs + if filter_on == PrioritizerFilter.CPU: + return self._cpu_refs + + return self._heapify_all_refs() + + def next( + self, + filter_on: t.Optional[PrioritizerFilter] = None, + tracking_id: t.Optional[str] = None, + hosts: t.Optional[t.List[str]] = None, + ) -> t.Optional[Node]: + """Find the next unassigned node using the supplied filter to target + a specific node capability + + :param filter_on: the subset of nodes to query for available nodes + :param tracking_id: unique task identifier to track + :param hosts: a list of hostnames used to filter the available nodes + :returns: a reference counter for an available node if an unassigned node + exists, `None` otherwise""" + if results := self.next_n(1, filter_on, tracking_id, hosts): + return results[0] + return None + + def next_n( + self, + num_items: int = 1, + filter_on: t.Optional[PrioritizerFilter] = None, + tracking_id: t.Optional[str] = None, + hosts: t.Optional[t.List[str]] = None, + ) -> t.List[Node]: + """Find the next N available nodes w/least amount of references using + the supplied filter to target a specific node capability + + :param num_items: number of nodes to reserve + :param filter_on: the subset of nodes to query for available nodes + :param tracking_id: unique task identifier to track + :param hosts: a list of hostnames used to filter the available nodes + :returns: Collection of reserved nodes + :raises ValueError: if the hosts parameter is an empty list""" + if hosts is not None and not hosts: + raise ValueError("No hostnames provided") + + heap = self._create_sub_heap(hosts, filter_on) + return self._get_next_n_available_nodes(num_items, heap, tracking_id) diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index dd93d7910c..21fdc697c4 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -169,6 +169,7 @@ def _write_request_file(self) -> str: env = run_settings.env_vars nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + hosts_csv = str(run_args.get("host-list", "")) policy = DragonRunPolicy.from_run_args(run_args) @@ -187,6 +188,7 @@ def _write_request_file(self) -> str: output_file=out, error_file=err, policy=policy, + hostlist=hosts_csv, ) requests.append(request_registry.to_string(request)) with open(request_file, "w", encoding="utf-8") as
script_file: diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index 69a91547e7..15e5855448 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -95,6 +95,26 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: self.run_args["node-feature"] = ",".join(feature_list) + @override + def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + """Specify the hostlist for this job + + :param host_list: hosts to launch on + :raises ValueError: if an empty host list is supplied + """ + if not host_list: + raise ValueError("empty hostlist provided") + + if isinstance(host_list, str): + host_list = host_list.replace(" ", "").split(",") + + # strip out all whitespace-only values + cleaned_list = [host.strip() for host in host_list if host and host.strip()] + if not len(cleaned_list) == len(host_list): + raise ValueError(f"invalid names found in hostlist: {host_list}") + + self.run_args["host-list"] = ",".join(cleaned_list) + def set_cpu_affinity(self, devices: t.List[int]) -> None: """Set the CPU affinity for this job diff --git a/tests/test_dragon_run_request.py b/tests/test_dragon_run_request.py index 94c17c222a..5ff95f4087 100644 --- a/tests/test_dragon_run_request.py +++ b/tests/test_dragon_run_request.py @@ -30,18 +30,14 @@ import time from unittest.mock import MagicMock +import pydantic.error_wrappers import pytest -from pydantic import ValidationError + +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b - -try: - import dragon - - dragon_loaded = True -except: - dragon_loaded = False +dragon = pytest.importorskip("dragon") from smartsim._core.config import CONFIG from smartsim._core.schemas.dragonRequests import * @@ -56,38 +52,6 @@ ) -class NodeMock(MagicMock): - def __init__( - self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 - ) -> None: - super().__init__() - self._mock_id = name - NodeMock._num_gpus = num_gpus - NodeMock._num_cpus = num_cpus - - @property - def hostname(self) -> str: - if self._mock_id: - return self._mock_id - return create_short_id_str() - - @property - def num_cpus(self) -> str: - return NodeMock._num_cpus - - @property - def num_gpus(self) -> str: - return NodeMock._num_gpus - - def _set_id(self, value: str) -> None: - self._mock_id = value - - def gpus(self, parent: t.Any = None) -> t.List[str]: - if self._num_gpus: - return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)] - return [] - - class GroupStateMock(MagicMock): def Running(self) -> MagicMock: running = MagicMock(**{"__str__.return_value": "Running"}) @@ -102,69 +66,59 @@ class ProcessGroupMock(MagicMock): puids = [121, 122] -def node_mock() -> NodeMock: - return NodeMock() - - def get_mock_backend( - monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2 + monkeypatch: pytest.MonkeyPatch, num_cpus: int, num_gpus: int ) -> "DragonBackend": - + # create all the necessary namespaces as raw magic mocks + monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.machine", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.group_state", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.process", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.connection", 
MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.policy", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.process_desc", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock()) + + node_list = ["node1", "node2", "node3"] + system_mock = MagicMock(return_value=MagicMock(nodes=node_list)) + node_mock = lambda x: MagicMock(hostname=x, num_cpus=num_cpus, num_gpus=num_gpus) + process_group_mock = MagicMock(return_value=ProcessGroupMock()) process_mock = MagicMock(returncode=0) - process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) - process_module_mock = MagicMock() - process_module_mock.Process = process_mock - node_mock = NodeMock(num_gpus=num_gpus) - system_mock = MagicMock(nodes=["node1", "node2", "node3"]) + policy_mock = MagicMock(return_value=MagicMock()) + group_state_mock = GroupStateMock() + + # customize members that must perform specific actions within the namespaces monkeypatch.setitem( sys.modules, "dragon", MagicMock( **{ - "native.machine.Node.return_value": node_mock, - "native.machine.System.return_value": system_mock, - "native.group_state": GroupStateMock(), - "native.process_group.ProcessGroup.return_value": ProcessGroupMock(), + "native.machine.Node": node_mock, + "native.machine.System": system_mock, + "native.group_state": group_state_mock, + "native.process_group.ProcessGroup": process_group_mock, + "native.process_group.Process": process_mock, + "native.process.Process": process_mock, + "infrastructure.policy.Policy": policy_mock, } ), ) - monkeypatch.setitem( - sys.modules, - "dragon.infrastructure.connection", - MagicMock(), - ) - monkeypatch.setitem( - sys.modules, - "dragon.infrastructure.process_desc", - MagicMock(), - ) - monkeypatch.setitem( - sys.modules, - "dragon.data.ddict.ddict", - MagicMock(), - ) - monkeypatch.setitem( - sys.modules, - "dragon.infrastructure.policy", - MagicMock(**{"Policy.return_value": MagicMock()}), - ) - monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) - monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock) - monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) - monkeypatch.setitem( - sys.modules, - "dragon.native.machine", - MagicMock( - **{"System.return_value": system_mock, "Node.return_value": node_mock} - ), - ) from smartsim._core.launcher.dragon.dragonBackend import DragonBackend dragon_backend = DragonBackend(pid=99999) - monkeypatch.setattr( - dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) + + # NOTE: we're manually updating these values due to issue w/mocking namespaces + dragon_backend._prioritizer = NodePrioritizer( + [ + MagicMock(num_cpus=num_cpus, num_gpus=num_gpus, hostname=node) + for node in node_list + ], + dragon_backend._queue_lock, ) + dragon_backend._cpus = [num_cpus] * len(node_list) + dragon_backend._gpus = [num_gpus] * len(node_list) return dragon_backend @@ -222,16 +176,14 @@ def set_mock_group_infos( } monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) - monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3])) monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) return group_infos -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = 
get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) handshake_req = DragonHandshakeRequest() handshake_resp = dragon_backend.process_request(handshake_req) @@ -240,9 +192,8 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: assert handshake_resp.dragon_pid == 99999 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -269,7 +220,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 + assert len(dragon_backend.free_hosts) == 1 assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id @@ -281,7 +232,7 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 + assert len(dragon_backend.free_hosts) == 1 assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id @@ -291,9 +242,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) dragon_backend._shutdown_requested = True @@ -319,7 +269,7 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a policy is applied to a run request""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -335,10 +285,9 @@ def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert run_req.policy is None -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a policy is applied to a run request""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -366,7 +315,7 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 + assert len(dragon_backend._prioritizer.unassigned()) == 1 assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id @@ -378,7 +327,7 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert 
len(dragon_backend._free_hosts) == 1 + assert len(dragon_backend._prioritizer.unassigned()) == 1 assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id @@ -388,9 +337,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) @@ -405,9 +353,8 @@ def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: } -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) running_steps = [ @@ -434,10 +381,9 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: ) assert len(dragon_backend._allocated_hosts) == 0 - assert len(dragon_backend._free_hosts) == 3 + assert len(dragon_backend._prioritizer.unassigned()) == 3 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize( "immediate, kill_jobs, frontend_shutdown", [ @@ -456,7 +402,7 @@ def test_shutdown_request( frontend_shutdown: bool, ) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) set_mock_group_infos(monkeypatch, dragon_backend) @@ -496,11 +442,10 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("telemetry_flag", ["0", "1"]) def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) expected_cooldown = ( 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 @@ -512,19 +457,17 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) - assert dragon_backend.cooldown_period == expected_cooldown -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) first_heartbeat = dragon_backend.last_heartbeat assert dragon_backend.current_time > first_heartbeat dragon_backend._heartbeat() assert dragon_backend.last_heartbeat > first_heartbeat -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("num_nodes", [1, 3, 100]) def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", 
exe_args=["5"], @@ -537,18 +480,42 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: pmi_enabled=False, ) - assert dragon_backend._can_honor(run_req)[0] == ( - num_nodes <= len(dragon_backend._hosts) - ) + can_honor, error_msg = dragon_backend._can_honor(run_req) + + nodes_in_range = num_nodes <= len(dragon_backend._hosts) + assert can_honor == nodes_in_range + assert error_msg is None if nodes_in_range else error_msg is not None + + +@pytest.mark.parametrize("num_nodes", [-10, -1, 0]) +def test_can_honor_invalid_num_nodes( + monkeypatch: pytest.MonkeyPatch, num_nodes: int +) -> None: + """Verify that requests for invalid numbers of nodes (negative, zero) are rejected""" + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) + + with pytest.raises(pydantic.error_wrappers.ValidationError) as ex: + DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=num_nodes, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) def test_can_honor_cpu_affinity( monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] ) -> None: """Verify that valid CPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) + num_cpus, num_gpus = 8, 0 + dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus) + run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -565,11 +532,10 @@ def test_can_honor_cpu_affinity( assert dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that invalid CPU affinities are NOT accepted NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -586,13 +552,15 @@ def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> assert not dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1]]) def test_can_honor_gpu_affinity( monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] ) -> None: """Verify that valid GPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) + + num_cpus, num_gpus = 8, 2 + dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus) + run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -609,11 +577,10 @@ def test_can_honor_gpu_affinity( assert dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that invalid GPU affinities are NOT accepted NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -630,46 +597,45 @@ def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> assert not dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, 
reason="Test is only for Dragon WLM systems") def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a request for a GPU if none exists is not accepted""" # create a mock node class that always reports no GPUs available - dragon_backend = get_mock_backend(monkeypatch, num_gpus=0) - - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - # specify GPU device w/no affinity - policy=DragonRunPolicy(gpu_affinity=[0]), - ) - - assert not dragon_backend._can_honor(run_req)[0] + with monkeypatch.context() as ctx: + dragon_backend = get_mock_backend(ctx, num_cpus=8, num_gpus=0) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + # specify GPU device w/no affinity + policy=DragonRunPolicy(gpu_affinity=[0]), + ) + can_honor, _ = dragon_backend._can_honor(run_req) + assert not can_honor -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) step_id = next(dragon_backend._step_ids) assert step_id.endswith("0") assert step_id != next(dragon_backend._step_ids) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_view(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) set_mock_group_infos(monkeypatch, dragon_backend) hosts = dragon_backend.hosts + dragon_backend._prioritizer.increment(hosts[0]) - expected_message = textwrap.dedent(f"""\ + expected_msg = textwrap.dedent(f"""\ Dragon server backend update | Host | Status | |--------|----------| @@ -677,7 +643,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: | {hosts[1]} | Free | | {hosts[2]} | Free | | Step | Status | Hosts | Return codes | Num procs | - |----------|--------------|-------------|----------------|-------------| + |----------|--------------|-----------------|----------------|-------------| | abc123-1 | Running | {hosts[0]} | | 1 | | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | @@ -686,6 +652,79 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: # get rid of white space to make the comparison easier actual_msg = dragon_backend.status_message.replace(" ", "") - expected_message = expected_message.replace(" ", "") + expected_msg = expected_msg.replace(" ", "") + + # ignore dashes in separators (hostname changes may cause column expansion) + while actual_msg.find("--") > -1: + actual_msg = actual_msg.replace("--", "-") + while expected_msg.find("--") > -1: + expected_msg = expected_msg.replace("--", "-") + + assert actual_msg == expected_msg + + +def test_can_honor_hosts_unavailable_hosts(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that requesting nodes with invalid names causes number of available + nodes check to fail due to valid # of named nodes being under num_nodes""" + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) + + # let's supply 2 invalid and 1 valid hostname + actual_hosts = list(dragon_backend._hosts) + actual_hosts[0] = f"x{actual_hosts[0]}" + actual_hosts[1] = 
f"x{actual_hosts[1]}" + + host_list = ",".join(actual_hosts) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, # <----- requesting 2 of 3 available nodes + hostlist=host_list, # <--- only one valid name available + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), + ) + + can_honor, error_msg = dragon_backend._can_honor(run_req) + + # confirm the failure is indicated + assert not can_honor + # confirm failure message indicates number of nodes requested as cause + assert "named hosts" in error_msg + + +def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that requesting nodes with invalid names reduces the number of valid + hosts considered but the request still passes when enough valid named nodes remain""" + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) + + # let's supply 2 valid and 1 invalid hostname + actual_hosts = list(dragon_backend._hosts) + actual_hosts[0] = f"x{actual_hosts[0]}" + + host_list = ",".join(actual_hosts) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, # <----- requesting 2 of 3 available nodes + hostlist=host_list, # <--- two valid names are available + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), + ) + + can_honor, error_msg = dragon_backend._can_honor(run_req) - assert actual_msg == expected_message + # confirm the request is accepted + assert can_honor, error_msg + # confirm no error message is produced on success + assert error_msg is None, error_msg diff --git a/tests/test_dragon_runsettings.py b/tests/test_dragon_runsettings.py index 34e8510e82..8c7600c74c 100644 --- a/tests/test_dragon_runsettings.py +++ b/tests/test_dragon_runsettings.py @@ -96,3 +96,122 @@ def test_dragon_runsettings_gpu_affinity(): # ensure the value is not changed when we extend the list rs.run_args["gpu-affinity"] = "7,8,9" assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value) + + +def test_dragon_runsettings_hostlist_null(): + """Verify that passing a null hostlist is treated as a failure""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + with pytest.raises(ValueError) as ex: + rs.set_hostlist(None) + + assert "empty hostlist" in ex.value.args[0] + + +def test_dragon_runsettings_hostlist_empty(): + """Verify that passing an empty hostlist is treated as a failure""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + with pytest.raises(ValueError) as ex: + rs.set_hostlist([]) + + assert "empty hostlist" in ex.value.args[0] + + +@pytest.mark.parametrize("hostlist_csv", [" ", " , , , ", ",", ",,,"]) +def test_dragon_runsettings_hostlist_whitespace_handling(hostlist_csv: str): + """Verify that passing a hostlist with empty-string host names is treated as a failure""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + # empty string as hostname in list + with pytest.raises(ValueError) as ex: + rs.set_hostlist(hostlist_csv) + + assert "invalid names" in ex.value.args[0] + + 
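For context, a minimal usage sketch of the hostlist handling exercised by these new tests (hostnames are illustrative, and DragonRunSettings is assumed to be imported as elsewhere in this test module):

    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
    rs.set_nodes(2)
    rs.set_hostlist(["host1", "host2", "host3"])  # a CSV string such as "host1, host2, host3" is also accepted
    # the cleaned, comma-joined value is stored in the run args and later forwarded
    # to the DragonRunRequest as its hostlist field by the launcher/step changes above
    assert rs.run_args["host-list"] == "host1,host2,host3"
    # null, empty, or whitespace-only entries raise ValueError instead of being silently dropped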
+@pytest.mark.parametrize( + "hostlist_csv", [[" "], [" ", "", " ", " "], ["", " "], ["", "", "", ""]] +) +def test_dragon_runsettings_hostlist_whitespace_handling_list(hostlist_csv: str): + """Verify that passing a hostlist with emptystring host names contained in a list + is treated as a failure""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + # empty string as hostname in list + with pytest.raises(ValueError) as ex: + rs.set_hostlist(hostlist_csv) + + assert "invalid names" in ex.value.args[0] + + +def test_dragon_runsettings_hostlist_as_csv(): + """Verify that a hostlist is stored properly when passing in a CSV string""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + hostnames = ["host0", "host1", "host2", "host3", "host4"] + + # set the host list with ideal comma separated values + input0 = ",".join(hostnames) + + # set the host list with a string of comma separated values + # including extra whitespace + input1 = ", ".join(hostnames) + + for hosts_input in [input0, input1]: + rs.set_hostlist(hosts_input) + + stored_list = rs.run_args.get("host-list", None) + assert stored_list + + # confirm that all values from the original list are retrieved + split_stored_list = stored_list.split(",") + assert set(hostnames) == set(split_stored_list) + + +def test_dragon_runsettings_hostlist_as_csv(): + """Verify that a hostlist is stored properly when passing in a CSV string""" + rs = DragonRunSettings(exe="sleep", exe_args=["1"]) + + # baseline check that no host list exists + stored_list = rs.run_args.get("host-list", None) + assert stored_list is None + + hostnames = ["host0", "host1", "host2", "host3", "host4"] + + # set the host list with ideal comma separated values + input0 = ",".join(hostnames) + + # set the host list with a string of comma separated values + # including extra whitespace + input1 = ", ".join(hostnames) + + for hosts_input in [input0, input1]: + rs.set_hostlist(hosts_input) + + stored_list = rs.run_args.get("host-list", None) + assert stored_list + + # confirm that all values from the original list are retrieved + split_stored_list = stored_list.split(",") + assert set(hostnames) == set(split_stored_list) diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py index 19f408e0bd..f933fb7bc2 100644 --- a/tests/test_dragon_step.py +++ b/tests/test_dragon_step.py @@ -73,12 +73,18 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep: cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + # specify 3 hostnames to select from but require only 2 nodes + num_nodes = 2 + hostnames = ["host1", "host2", "host3"] + # assign some unique affinities to each run setting instance for index, rs in enumerate(settings): if gpu_affinities[index]: rs.set_node_feature("gpu") rs.set_cpu_affinity(cpu_affinities[index]) rs.set_gpu_affinity(gpu_affinities[index]) + rs.set_hostlist(hostnames) + rs.set_nodes(num_nodes) steps = list( DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings) @@ -374,6 +380,11 @@ def test_dragon_batch_step_write_request_file( cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]] gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]] + hostnames = ["host1", "host2", "host3"] + num_nodes = 2 + + # parse requests file path from the 
launch command + # e.g. dragon python launch_cmd = dragon_batch_step.get_launch_cmd() requests_file = get_request_path_from_batch_script(launch_cmd) @@ -392,3 +403,5 @@ def test_dragon_batch_step_write_request_file( assert run_request assert run_request.policy.cpu_affinity == cpu_affinities[index] assert run_request.policy.gpu_affinity == gpu_affinities[index] + assert run_request.nodes == num_nodes + assert run_request.hostlist == ",".join(hostnames) diff --git a/tests/test_node_prioritizer.py b/tests/test_node_prioritizer.py new file mode 100644 index 0000000000..abb4624b6a --- /dev/null +++ b/tests/test_node_prioritizer.py @@ -0,0 +1,555 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
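For orientation before the test definitions, an illustrative sketch of the prioritizer workflow these tests verify, using only the pqueue API added above; FakeNode is a hypothetical stand-in, since any object exposing hostname, num_cpus, and num_gpus satisfies the Node protocol:

    import threading
    from dataclasses import dataclass

    from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter

    @dataclass
    class FakeNode:  # hypothetical stand-in satisfying the Node protocol
        hostname: str
        num_cpus: int
        num_gpus: int

    nodes = [FakeNode(f"cpu-node-{i}", 8, 0) for i in range(3)]
    prioritizer = NodePrioritizer(nodes, threading.RLock())

    # reserve the two least-referenced CPU nodes and tag them with a step id
    reserved = prioritizer.next_n(2, filter_on=PrioritizerFilter.CPU, tracking_id="step-0")
    hostnames = [node.hostname for node in reserved]  # work would be launched on these hosts

    # release the reservations when the step completes
    for node in reserved:
        prioritizer.decrement(node.hostname, tracking_id="step-0")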
+import random +import threading +import typing as t + +import pytest + +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +logger = get_logger(__name__) + + +class MockNode: + def __init__(self, hostname: str, num_cpus: int, num_gpus: int) -> None: + self.hostname = hostname + self.num_cpus = num_cpus + self.num_gpus = num_gpus + + +def mock_node_hosts( + num_cpu_nodes: int, num_gpu_nodes: int +) -> t.Tuple[t.List[MockNode], t.List[MockNode]]: + cpu_hosts = [f"cpu-node-{i}" for i in range(num_cpu_nodes)] + gpu_hosts = [f"gpu-node-{i}" for i in range(num_gpu_nodes)] + + return cpu_hosts, gpu_hosts + + +def mock_node_builder(num_cpu_nodes: int, num_gpu_nodes: int) -> t.List[MockNode]: + nodes = [] + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + + nodes.extend(MockNode(hostname, 4, 0) for hostname in cpu_hosts) + nodes.extend(MockNode(hostname, 4, 4) for hostname in gpu_hosts) + + return nodes + + +def test_node_prioritizer_init_null() -> None: + """Verify that the prioritizer reports a failure when no valid node set + is provided (a null value is passed)""" + lock = threading.RLock() + with pytest.raises(SmartSimError) as ex: + NodePrioritizer(None, lock) + + assert "Missing" in ex.value.args[0] + + +def test_node_prioritizer_init_empty() -> None: + """Verify that the prioritizer reports a failure when no valid node set + is provided (an empty list is passed)""" + lock = threading.RLock() + with pytest.raises(SmartSimError) as ex: + NodePrioritizer([], lock) + + assert "Missing" in ex.value.args[0] + + +@pytest.mark.parametrize( + "num_cpu_nodes,num_gpu_nodes", [(1, 1), (2, 1), (1, 2), (8, 4), (1000, 200)] +) +def test_node_prioritizer_init_ok(num_cpu_nodes: int, num_gpu_nodes: int) -> None: + """Verify that initialization with a valid node list results in the + appropriate cpu & gpu ref counts, and complete ref map""" + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + # perform prioritizer initialization + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # get a copy of all the expected host names + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + all_hosts = cpu_hosts + gpu_hosts + assert len(all_hosts) == num_cpu_nodes + num_gpu_nodes + + # verify tracking data is initialized correctly for all nodes + for hostname in all_hosts: + # show that the ref map is tracking the node + assert hostname in p._nodes + + tracking_info = p.get_tracking_info(hostname) + + # show that the node is created w/zero ref counts + assert tracking_info.num_refs == 0 + + # show that the node is created and marked as not dirty (unchanged) + # assert tracking_info.is_dirty == False + + # iterate through known cpu node keys and verify prioritizer initialization + for hostname in cpu_hosts: + # show that the device ref counters are appropriately assigned + cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None) + assert cpu_ref, "CPU-only node not found in cpu ref set" + + gpu_ref = next((n for n in p._gpu_refs if n.hostname == hostname), None) + assert not gpu_ref, "CPU-only node should not be found in gpu ref set" + + # iterate through known GPU node keys and verify prioritizer initialization + for hostname in gpu_hosts: + # show that the device ref counters are appropriately assigned + gpu_ref = next((n for n in p._gpu_refs if n.hostname
== hostname), None) + assert gpu_ref, "GPU-only node not found in gpu ref set" + + cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None) + assert not cpu_ref, "GPU-only node should not be found in cpu ref set" + + # verify we have all hosts in the ref map + assert set(p._nodes.keys()) == set(all_hosts) + + # verify we have no extra hosts in ref map + assert len(p._nodes.keys()) == len(set(all_hosts)) + + +def test_node_prioritizer_direct_increment() -> None: + """Verify that performing the increment operation causes the expected + side effect on the intended records""" + + num_cpu_nodes, num_gpu_nodes = 32, 8 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + exclude_index = 2 + exclude_host0 = cpu_hosts[exclude_index] + exclude_host1 = gpu_hosts[exclude_index] + exclusions = [exclude_host0, exclude_host1] + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # let's increment each element in a predictable way and verify + for node in nodes: + if node.hostname in exclusions: + # expect 1 cpu and 1 gpu node at zero and not incremented + continue + + if node.num_gpus == 0: + num_increments = random.randint(0, num_cpu_nodes - 1) + else: + num_increments = random.randint(0, num_gpu_nodes - 1) + + # increment this node some random number of times + for _ in range(num_increments): + p.increment(node.hostname) + + # ... and verify the correct incrementing is applied + tracking_info = p.get_tracking_info(node.hostname) + assert tracking_info.num_refs == num_increments + + # verify the excluded cpu node was never changed + tracking_info0 = p.get_tracking_info(exclude_host0) + assert tracking_info0.num_refs == 0 + + # verify the excluded gpu node was never changed + tracking_info1 = p.get_tracking_info(exclude_host1) + assert tracking_info1.num_refs == 0 + + +def test_node_prioritizer_indirect_increment() -> None: + """Verify that performing the increment operation indirectly affects + each available node until we run out of nodes to return""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # verify starting state + for node in p._nodes.values(): + tracking_info = p.get_tracking_info(node.hostname) + + assert node.num_refs == 0 # <--- ref count starts at zero + assert tracking_info.num_refs == 0 # <--- ref count starts at zero + + # perform indirect + for node in p._nodes.values(): + tracking_info = p.get_tracking_info(node.hostname) + + # apply `next` operation and verify tracking info reflects new ref + node = p.next(PrioritizerFilter.CPU) + tracking_info = p.get_tracking_info(node.hostname) + + # verify side-effects + assert tracking_info.num_refs > 0 # <--- ref count should now be > 0 + + # we expect it to give back only "clean" nodes from next* + assert tracking_info.is_dirty == False # NOTE: this is "hidden" by protocol + + # every node should be incremented now. 
prioritizer shouldn't have anything to give + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info is None # <--- get_next shouldn't have any nodes to give + + +def test_node_prioritizer_indirect_decrement_availability() -> None: + """Verify that a node who is decremented (dirty) is made assignable + on a subsequent request""" + + num_cpu_nodes, num_gpu_nodes = 1, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # increment our only node... + p.increment(cpu_hosts[0]) + + tracking_info = p.next() + assert tracking_info is None, "No nodes should be assignable" + + # perform a decrement... + p.decrement(cpu_hosts[0]) + + # ... and confirm that the node is available again + tracking_info = p.next() + assert tracking_info is not None, "A node should be assignable" + + +def test_node_prioritizer_multi_increment() -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + increments reference counts and returns appropriate results""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + assert p.get_tracking_info(cpu_hosts[0]).num_refs > 0 + + p.increment(cpu_hosts[2]) + assert p.get_tracking_info(cpu_hosts[2]).num_refs > 0 + + p.increment(cpu_hosts[4]) + assert p.get_tracking_info(cpu_hosts[4]).num_refs > 0 + + # use next_n w/the minimum allowed value + all_tracking_info = p.next_n(1, PrioritizerFilter.CPU) # <---- next_n(1) + + # confirm the number requested is honored + assert len(all_tracking_info) == 1 + # ensure no unavailable node is returned + assert all_tracking_info[0].hostname not in [ + cpu_hosts[0], + cpu_hosts[2], + cpu_hosts[4], + ] + + # use next_n w/value that exceeds available number of open nodes + # 3 direct increments in setup, 1 out of next_n(1), 4 left + all_tracking_info = p.next_n(5, PrioritizerFilter.CPU) + + # confirm that no nodes are returned, even though 4 out of 5 requested are available + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_multi_increment_validate_n() -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + reports failures when the request size is above pool size""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # we have 8 total cpu nodes available... 
request too many nodes + all_tracking_info = p.next_n(9, PrioritizerFilter.CPU) + assert len(all_tracking_info) == 0 + + all_tracking_info = p.next_n(num_cpu_nodes * 1000, PrioritizerFilter.CPU) + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_indirect_direct_interleaved_increments() -> None: + """Verify that interleaving indirect and direct increments results in + expected ref counts""" + + num_cpu_nodes, num_gpu_nodes = 8, 4 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # perform some set of non-popped increments + p.increment(gpu_hosts[1]) + p.increment(gpu_hosts[3]) + p.increment(gpu_hosts[3]) + + # increment 0th item 1x + p.increment(cpu_hosts[0]) + + # increment 3th item 2x + p.increment(cpu_hosts[3]) + p.increment(cpu_hosts[3]) + + # increment last item 3x + p.increment(cpu_hosts[7]) + p.increment(cpu_hosts[7]) + p.increment(cpu_hosts[7]) + + tracking_info = p.get_tracking_info(gpu_hosts[1]) + assert tracking_info.num_refs == 1 + + tracking_info = p.get_tracking_info(gpu_hosts[3]) + assert tracking_info.num_refs == 2 + + nodes = [n for n in p._nodes.values() if n.num_refs == 0 and n.num_gpus == 0] + + # we should skip the 0-th item in the heap due to direct increment + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + # confirm we get a cpu node + assert "cpu-node" in tracking_info.hostname + + # this should pull the next item right out + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + assert "cpu-node" in tracking_info.hostname + + # ensure we pull from gpu nodes and the 0th item is returned + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info.num_refs == 1 + assert "gpu-node" in tracking_info.hostname + + # we should step over the 3-th node on this iteration + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + assert "cpu-node" in tracking_info.hostname + + # and ensure that heap also steps over a direct increment + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info.num_refs == 1 + assert "gpu-node" in tracking_info.hostname + + # and another GPU request should return nothing + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info is None + + +def test_node_prioritizer_decrement_floor() -> None: + """Verify that repeatedly decrementing ref counts does not + allow negative ref counts""" + + num_cpu_nodes, num_gpu_nodes = 8, 4 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # try a ton of decrements on all the items in the prioritizer + for _ in range(len(nodes) * 100): + index = random.randint(0, num_cpu_nodes - 1) + p.decrement(cpu_hosts[index]) + + index = random.randint(0, num_gpu_nodes - 1) + p.decrement(gpu_hosts[index]) + + for node in nodes: + tracking_info = p.get_tracking_info(node.hostname) + assert tracking_info.num_refs == 0 + + +@pytest.mark.parametrize("num_requested", [1, 2, 3]) +def test_node_prioritizer_multi_increment_subheap(num_requested: int) -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + increments reference counts and returns appropriate results + when requesting an in-bounds number of nodes""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = 
mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + p.increment(cpu_hosts[4]) + + hostnames = [cpu_hosts[0], cpu_hosts[1], cpu_hosts[2], cpu_hosts[3], cpu_hosts[5]] + + # request n == {num_requested} nodes from set of 3 available + all_tracking_info = p.next_n( + num_requested, + hosts=hostnames, + ) # <---- w/0,2,4 assigned, only 1,3,5 from hostnames can work + + # all parameterizations should result in a matching output size + assert len(all_tracking_info) == num_requested + + +def test_node_prioritizer_multi_increment_subheap_assigned() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not return anything when the number requested cannot be satisfied + by the given subheap due to prior assignment""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [ + cpu_hosts[0], + "x" + cpu_hosts[2], + ] # <--- we can't get 2 from 1 valid node name + + # request n == {num_requested} nodes from set of 3 available + num_requested = 2 + all_tracking_info = p.next_n(num_requested, hosts=hostnames) + + # w/0,2 assigned, nothing can be returned + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_empty_subheap_next_w_hosts() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not allow an empty host list""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [] + + # request n == {num_requested} nodes from set of 3 available + num_requested = 1 + with pytest.raises(ValueError) as ex: + p.next(hosts=hostnames) + + assert "No hostnames provided" == ex.value.args[0] + + +def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not allow an empty host list""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [] + + # request n == {num_requested} nodes from set of 3 available + num_requested = 1 + with pytest.raises(ValueError) as ex: + p.next_n(num_requested, hosts=hostnames) + + assert "No hostnames provided" == ex.value.args[0] + + +@pytest.mark.parametrize("num_requested", [-100, -1, 0]) +def test_node_prioritizer_empty_subheap_next_n(num_requested: int) -> None: + """Verify that retrieving a node via `next_n` API does + not allow a request with num_items < 1""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = 
threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + # request n == {num_requested} nodes from set of 3 available + with pytest.raises(ValueError) as ex: + p.next_n(num_requested) + + assert "Number of items requested" in ex.value.args[0] + + +@pytest.mark.parametrize("num_requested", [-100, -1, 0]) +def test_node_prioritizer_empty_subheap_next_n(num_requested: int) -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not allow a request with num_items < 1""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [cpu_hosts[0], cpu_hosts[2]] + + # request n == {num_requested} nodes from set of 3 available + with pytest.raises(ValueError) as ex: + p.next_n(num_requested, hosts=hostnames) + + assert "Number of items requested" in ex.value.args[0] From 6d5518b539b19429f5443eb53a5355ef06cdcce2 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:50:42 -0400 Subject: [PATCH 19/60] fix init reordering bug (#675) Fix 3 bugs: 1. reordering the init sequence in the dragon backend resulted in an un-set collection being used 2. fix tests that should have been updated to compare set contents instead of individual items 3. remove newly added validation on empty host lists that broke existing tests --- .../_core/launcher/dragon/dragonBackend.py | 10 ++-- smartsim/_core/launcher/dragon/pqueue.py | 6 --- tests/test_dragon_run_request.py | 49 +++++++++++++++---- tests/test_node_prioritizer.py | 18 +++---- 4 files changed, 53 insertions(+), 30 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2fda876462..6cf39be0fb 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -157,7 +157,6 @@ def __init__(self, pid: int) -> None: self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count()) """Incremental ID to assign to new steps prior to execution""" - self._initialize_hosts() self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( collections.OrderedDict() ) @@ -188,11 +187,7 @@ def __init__(self, pid: int) -> None: else 5 ) """Time in seconds needed to server to complete shutdown""" - - self._view = DragonBackendView(self) - logger.debug(self._view.host_desc) self._infra_ddict: t.Optional[dragon_ddict.DDict] = None - self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) self._nodes: t.List["dragon_machine.Node"] = [] """Node capability information for hosts in the allocation""" @@ -205,6 +200,11 @@ def __init__(self, pid: int) -> None: self._allocated_hosts: t.Dict[str, t.Set[str]] = {} """Mapping with hostnames as keys and a set of running step IDs as the value""" + self._initialize_hosts() + self._view = DragonBackendView(self) + logger.debug(self._view.host_desc) + self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) + @property def hosts(self) -> list[str]: with self._queue_lock: diff --git a/smartsim/_core/launcher/dragon/pqueue.py b/smartsim/_core/launcher/dragon/pqueue.py index a9faf76b1e..8c14a828f5 100644 --- 
a/smartsim/_core/launcher/dragon/pqueue.py +++ b/smartsim/_core/launcher/dragon/pqueue.py @@ -143,9 +143,6 @@ def remove( :param tracking_id: a unique task identifier executing on the node to remove :raises ValueError: if tracking_id is already assigned to this node""" - if tracking_id and tracking_id not in self.assigned_tasks: - raise ValueError("Attempted removal of untracked item") - self._num_refs = max(self._num_refs - 1, 0) if tracking_id: self._assigned_tasks = self._assigned_tasks - {tracking_id} @@ -460,8 +457,5 @@ def next_n( :param hosts: a list of hostnames used to filter the available nodes :returns: Collection of reserved nodes :raises ValueError: if the hosts parameter is an empty list""" - if hosts is not None and not hosts: - raise ValueError("No hostnames provided") - heap = self._create_sub_heap(hosts, filter_on) return self._get_next_n_available_nodes(num_items, heap, tracking_id) diff --git a/tests/test_dragon_run_request.py b/tests/test_dragon_run_request.py index 5ff95f4087..62ac572eb2 100644 --- a/tests/test_dragon_run_request.py +++ b/tests/test_dragon_run_request.py @@ -176,7 +176,7 @@ def set_mock_group_infos( } monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) - monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) + monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: {"abc123-1"}}) monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) return group_infos @@ -221,8 +221,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend.free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" @@ -233,8 +233,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend.free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED @@ -316,8 +316,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert len(dragon_backend._prioritizer.unassigned()) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" @@ -328,8 +328,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 assert 
len(dragon_backend._prioritizer.unassigned()) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED @@ -728,3 +728,34 @@ def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) - assert can_honor, error_msg # confirm failure message indicates number of nodes requested as cause assert error_msg is None, error_msg + + +def test_can_honor_hosts_1_hosts_requested(monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that requesting nodes with invalid names causes number of available + nodes check to be reduced but still passes if enough valid named nodes are passed""" + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) + + # let's supply 2 valid and 1 invalid hostname + actual_hosts = list(dragon_backend._hosts) + actual_hosts[0] = f"x{actual_hosts[0]}" + + host_list = ",".join(actual_hosts) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=1, # <----- requesting 0 nodes - should be ignored + hostlist=host_list, # <--- two valid names are available + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + policy=DragonRunPolicy(), + ) + + can_honor, error_msg = dragon_backend._can_honor(run_req) + + # confirm the failure is indicated + assert can_honor, error_msg diff --git a/tests/test_node_prioritizer.py b/tests/test_node_prioritizer.py index abb4624b6a..905c0ecc90 100644 --- a/tests/test_node_prioritizer.py +++ b/tests/test_node_prioritizer.py @@ -457,9 +457,9 @@ def test_node_prioritizer_multi_increment_subheap_assigned() -> None: assert len(all_tracking_info) == 0 -def test_node_prioritizer_empty_subheap_next_w_hosts() -> None: +def test_node_prioritizer_empty_subheap_next_w_no_hosts() -> None: """Verify that retrieving multiple nodes via `next_n` API does - not allow an empty host list""" + with an empty host list uses the entire available host list""" num_cpu_nodes, num_gpu_nodes = 8, 0 cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) @@ -476,15 +476,15 @@ def test_node_prioritizer_empty_subheap_next_w_hosts() -> None: # request n == {num_requested} nodes from set of 3 available num_requested = 1 - with pytest.raises(ValueError) as ex: - p.next(hosts=hostnames) + node = p.next(hosts=hostnames) + assert node - assert "No hostnames provided" == ex.value.args[0] + # assert "No hostnames provided" == ex.value.args[0] def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None: """Verify that retrieving multiple nodes via `next_n` API does - not allow an empty host list""" + not blow up with an empty host list""" num_cpu_nodes, num_gpu_nodes = 8, 0 cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) @@ -501,10 +501,8 @@ def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None: # request n == {num_requested} nodes from set of 3 available num_requested = 1 - with pytest.raises(ValueError) as ex: - p.next_n(num_requested, hosts=hostnames) - - assert "No hostnames provided" == ex.value.args[0] + node = p.next_n(num_requested, hosts=hostnames) + assert node is not None @pytest.mark.parametrize("num_requested", [-100, -1, 0]) From 5d85995f5153e3e88de9b4b218cbae7dfb3e1cc9 Mon Sep 17 00:00:00 2001 From: Al 
Rigazzi Date: Wed, 28 Aug 2024 10:19:48 +0200 Subject: [PATCH 20/60] Queue-based Worker Manager (#647) This PR adds the `RequestDispatcher` to the MLI. The `RequestDispatcher` batches inference requests together and dispatches batches to `WorkerManagers`. [ committed by @al-rigazzi ] [ reviewed by @mellis13 @ankona @AlyssaCote ] --- doc/changelog.md | 1 + ex/high_throughput_inference/mli_driver.py | 34 +- ex/high_throughput_inference/mock_app.py | 136 ++--- .../mock_app_redis.py | 28 +- ex/high_throughput_inference/redis_driver.py | 15 +- .../standalone_workermanager.py | 146 ++++- setup.py | 2 +- smartsim/_core/entrypoints/service.py | 17 - .../_core/launcher/dragon/dragonBackend.py | 5 +- .../_core/mli/comm/channel/dragonchannel.py | 6 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../mli/infrastructure/control/__init__.py | 0 .../infrastructure/control/devicemanager.py | 146 +++++ .../infrastructure/control/error_handling.py | 70 +++ .../control/requestdispatcher.py | 504 ++++++++++++++++++ .../infrastructure/control/workermanager.py | 427 ++++++--------- .../mli/infrastructure/worker/torch_worker.py | 163 ++++-- .../_core/mli/infrastructure/worker/worker.py | 218 +++++--- .../_core/mli/mli_schemas/model/__init__.py | 0 smartsim/_core/utils/timings.py | 143 +++++ .../test_core_machine_learning_worker.py | 99 ++-- tests/dragon/test_device_manager.py | 185 +++++++ tests/dragon/test_error_handling.py | 288 ++++++++-- tests/dragon/test_request_dispatcher.py | 331 ++++++++++++ tests/{mli => dragon}/test_torch_worker.py | 99 +++- tests/dragon/test_worker_manager.py | 14 +- 26 files changed, 2426 insertions(+), 655 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/control/devicemanager.py create mode 100644 smartsim/_core/mli/infrastructure/control/error_handling.py create mode 100644 smartsim/_core/mli/infrastructure/control/requestdispatcher.py create mode 100644 smartsim/_core/mli/mli_schemas/model/__init__.py create mode 100644 smartsim/_core/utils/timings.py rename tests/{mli => dragon}/test_core_machine_learning_worker.py (80%) create mode 100644 tests/dragon/test_device_manager.py create mode 100644 tests/dragon/test_request_dispatcher.py rename tests/{mli => dragon}/test_torch_worker.py (61%) diff --git a/doc/changelog.md b/doc/changelog.md index 964e62b49d..ac09ecf604 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add RequestDispatcher and the possibility of batching inference requests - Enable hostname selection for dragon tasks - Remove pydantic dependency from MLI code - Update MLI environment variables using new naming convention diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 0cf87ef2e2..807a70b219 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,4 +1,3 @@ -import argparse import os import base64 import cloudpickle @@ -6,14 +5,17 @@ from smartsim import Experiment from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES +from smartsim.settings import DragonRunSettings import time import typing as t -device = "gpu" +DEVICE = "gpu" +NUM_RANKS = 4 +NUM_WORKERS = 1 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") -model_name = 
os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" @@ -25,37 +27,51 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings( +worker_manager_rs: DragonRunSettings = exp.create_run_settings( sys.executable, [ worker_manager_script_name, "--device", - device, + DEVICE, "--worker_class", torch_worker_str, + "--batch_size", + str(NUM_RANKS//NUM_WORKERS), + "--batch_timeout", + str(0.00), + "--num_workers", + str(NUM_WORKERS) ], ) + +aff = [] + +worker_manager_rs.set_cpu_affinity(aff) + worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings( +app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", device], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)], ) +app_rs.set_tasks_per_node(NUM_RANKS) + + app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - exp.generate(worker_manager, app, overwrite=True) exp.start(worker_manager, app, block=False) while True: if exp.get_status(app)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(worker_manager) break if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(app) break - time.sleep(5) print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 44db70b71d..517d18fb2f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -41,20 +41,27 @@ import os import time import torch -import numbers -from collections import OrderedDict +from mpi4py import MPI from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer + +torch.set_num_interop_threads(16) +torch.set_num_threads(1) logger = get_logger("App") +logger.info("Started app") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False class ProtoClient: def __init__(self, timing_on: bool): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() connect_to_infrastructure() ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) @@ -70,61 +77,15 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - self._start = None - self._interm = None - self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() - self._timing_on = timing_on - - def _add_label_to_timings(self, label: str): - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: numbers.Number): - return f"{number:0.4e}" - - def start_timings(self, batch_size: int): - if self._timing_on: - self._add_label_to_timings("batch_size") - self._timings["batch_size"].append(batch_size) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self): - if self._timing_on: - self._add_label_to_timings("total_time") - self._timings["total_time"].append( - self._format_number(time.perf_counter() - self._start) - ) - - def measure_time(self, label: str): - if 
self._timing_on: - self._add_label_to_timings(label) - self._timings[label].append( - self._format_number(time.perf_counter() - self._interm) - ) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False): - print(" ".join(self._timings.keys())) - value_array = numpy.array( - [value for value in self._timings.values()], dtype=float - ) - value_array = numpy.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - numpy.save("timings.npy", value_array) - numpy.savetxt("timings.txt", value_array) + self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] - self.start_timings(batch.shape[0]) + self.perf_timer.start_timings("batch_size", batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape) ) - self.measure_time("build_tensor_descriptor") - built_model = None + self.perf_timer.measure_time("build_tensor_descriptor") if isinstance(model, str): model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) else: @@ -137,39 +98,39 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): output_descriptors=[], custom_attributes=None, ) - self.measure_time("build_request") + self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) - self.measure_time("serialize_request") - with self._to_worker_fli.sendh( - timeout=None, stream_channel=self._to_worker_ch - ) as to_sendh: + self.perf_timer.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - for t in tensors: - to_sendh.send_bytes(t.tobytes()) # TODO NOT FAST ENOUGH!!! - # to_sendh.send_bytes(bytes(t.data)) - logger.info(f"Message size: {len(request_bytes)} bytes") - - self.measure_time("send") + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! + self.perf_timer.measure_time("send_tensors") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self.measure_time("receive") + self.perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) - self.measure_time("deserialize_response") + self.perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? 
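# --- illustrative sketch (not part of the patch) -----------------------------
# The hand-rolled timing code removed above is replaced by the PerfTimer added in
# smartsim/_core/utils/timings.py, which is not shown in this hunk. The stand-in
# below only mirrors the usage pattern visible here (start_timings / measure_time /
# end_timings / print_timings); the real PerfTimer differs in detail.
import time
from collections import OrderedDict


class _SketchTimer:
    def __init__(self, timing_on: bool = True, prefix: str = "") -> None:
        self._timing_on = timing_on
        self._prefix = prefix
        self._timings: "OrderedDict[str, list[str]]" = OrderedDict()
        self._start = self._interm = 0.0

    def start_timings(self, label: str, value) -> None:
        # open a new measurement row, recording e.g. the batch size under `label`
        if self._timing_on:
            self._timings.setdefault(label, []).append(str(value))
            self._start = self._interm = time.perf_counter()

    def measure_time(self, label: str) -> None:
        # record the time elapsed since the previous checkpoint under `label`
        if self._timing_on:
            now = time.perf_counter()
            self._timings.setdefault(label, []).append(f"{now - self._interm:0.4e}")
            self._interm = now

    def end_timings(self) -> None:
        if self._timing_on:
            total = time.perf_counter() - self._start
            self._timings.setdefault("total_time", []).append(f"{total:0.4e}")

    def print_timings(self, to_file: bool = False) -> None:
        header = " ".join(self._timings.keys())
        rows = [" ".join(row) for row in zip(*self._timings.values())]
        text = "\n".join([header, *rows])
        if to_file:
            with open(f"{self._prefix}timings.txt", "w") as fh:
                fh.write(text)
        else:
            print(text)
# ------------------------------------------------------------------------------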
- data_blob = from_recvh.recv_bytes(timeout=None) - result = numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) ) - self.measure_time("deserialize_tensor") + self.perf_timer.measure_time("deserialize_tensor") - self.end_timings() + self.perf_timer.end_timings() return result def set_model(self, key: str, model: bytes): self._ddict[key] = model + class ResNetWrapper: def __init__(self, name: str, model: str): self._model = torch.jit.load(model) @@ -190,24 +151,39 @@ def model(self): def name(self): return self._name - if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") + parser.add_argument("--device", default="cpu", type=str) + parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() - resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) - total_iterations = 100 + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + # TODO: adapt to non-Nvidia devices + torch_device = args.device.replace("gpu", "cuda") + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) - for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: - logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size == 1)): - logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.name, resnet.get_batch(batch_size)) + TOTAL_ITERATIONS = 100 - client.print_timings(to_file=True) + for log2_bsize in range(args.log_max_batchsize+1): + b_size: int = 2**log2_bsize + logger.info(f"Batch size: {b_size}") + for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): + logger.info(f"Iteration: {iteration_number}") + sample_batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, sample_batch) + logger.info(client.perf_timer.get_last("total_time")) + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + local_res = pt_model(sample_batch.to(torch_device)) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() + res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() + local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() + logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + torch.cuda.synchronize() + + client.perf_timer.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index c56b4fb8b4..8978bcea23 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -29,7 +29,9 @@ import numpy import time import torch +from mpi4py import MPI from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer from smartredis import Client logger = get_logger("App") @@ -56,6 +58,9 @@ def name(self): if __name__ == "__main__": + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = 
parser.parse_args() @@ -65,24 +70,21 @@ def name(self): client = Client(cluster=False, address=None) client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"redis{rank}_") + total_iterations = 100 timings=[] for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): - timing = [batch_size] + perf_timer.start_timings("batch_size", batch_size) logger.info(f"Iteration: {iteration_number}") - start = time.perf_counter() - client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) - client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) - result = client.get_tensor(name="result") - end = time.perf_counter() - timing.append(end-start) - timings.append(timing) - + input_name = f"batch_{rank}" + output_name = f"result_{rank}" + client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + result = client.get_tensor(name=output_name) + perf_timer.end_timings() - timings_np = numpy.asarray(timings) - numpy.save("timings.npy", timings_np) - for timing in timings: - print(" ".join(str(t) for t in timing)) + perf_timer.print_timings(True) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index ceddba4ef7..ff57725d40 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -29,23 +29,24 @@ from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -import typing as t -device = "gpu" +DEVICE = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") -exp_path = os.path.join(filedir, "redis_ai") +exp_path = os.path.join(filedir, "redis_ai_multi") os.makedirs(exp_path, exist_ok=True) -exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) +exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path) db = exp.create_database(interface="hsn0") -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, exe_args = [app_script_name, "--device", DEVICE] + ) app_rs.set_nodes(1) -app_rs.set_tasks(1) +app_rs.set_tasks(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 982cb6cc38..0b8c61251b 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -24,28 +24,90 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
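# --- illustrative sketch (not part of the patch) -----------------------------
# The mock_app_redis.py changes above key every tensor by MPI rank so that ranks
# running concurrently against the same Orchestrator do not overwrite each other's
# inputs and outputs. A minimal version of that pattern, assuming a TorchScript
# model named "resnet50" has already been set on the database and that mpi4py and
# smartredis are installed:
import numpy as np
from mpi4py import MPI
from smartredis import Client


def run_one_batch(client: Client, rank: int, batch: np.ndarray) -> np.ndarray:
    input_name = f"batch_{rank}"    # rank-qualified keys avoid collisions
    output_name = f"result_{rank}"
    client.put_tensor(name=input_name, data=batch)
    client.run_model(name="resnet50", inputs=[input_name], outputs=[output_name])
    return client.get_tensor(name=output_name)


if __name__ == "__main__":
    rank = MPI.COMM_WORLD.Get_rank()
    client = Client(cluster=False, address=None)
    # batch shape is an assumption for illustration only
    result = run_one_batch(client, rank, np.random.rand(2, 3, 224, 224).astype(np.float32))
# ------------------------------------------------------------------------------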
-# isort: off + import dragon + +# pylint disable=import-error +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process from dragon import fli from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.managed_memory import MemoryPool +from dragon.utils import b64decode, b64encode +# pylint enable=import-error +# isort: off # isort: on + import argparse import base64 +import multiprocessing as mp +import os +import pickle +import socket +import sys +import time +import typing as t + import cloudpickle import optparse import os +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase + +from smartsim.log import get_logger + +logger = get_logger("Worker Manager Entry Point") + +mp.set_start_method("dragon") + +pid = os.getpid() +affinity = os.sched_getaffinity(pid) +logger.info(f"Entry point: {socket.gethostname()}, {affinity}") +logger.info(f"CPUS: {os.cpu_count()}") + + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + if __name__ == "__main__": @@ -66,8 +128,20 @@ parser.add_argument( "--num_workers", type=int, default=1, help="Number of workers to run" ) - + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="How many requests the workers will try to aggregate before processing them", + ) + parser.add_argument( + "--batch_timeout", + type=float, + default=0.001, + help="How much time (in seconds) should be waited before processing an incomplete aggregated request", + ) args = parser.parse_args() + connect_to_infrastructure() ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] ddict = DDict.attach(ddict_str) @@ -77,8 +151,12 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) - torch_worker = cloudpickle.loads(worker_type_name)() + arg_worker_type = cloudpickle.loads( + base64.b64decode(args.worker_class.encode("ascii")) + ) + + dfs = DragonFeatureStore(ddict) + comm_channel = 
DragonFLIChannel(to_worker_fli_serialized) descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor @@ -89,11 +167,57 @@ queue_factory=DragonFLIChannel.from_descriptor, ) - worker_manager = WorkerManager( + dispatcher = RequestDispatcher( + batch_timeout=args.batch_timeout, + batch_size=args.batch_size, config_loader=config_loader, - worker=torch_worker, - as_service=True, - cooldown=10, - device=args.device, + worker_type=arg_worker_type, ) - worker_manager.execute() + + wms = [] + worker_device = args.device + for wm_idx in range(args.num_workers): + + worker_manager = WorkerManager( + config_loader=config_loader, + worker_type=arg_worker_type, + as_service=True, + cooldown=10, + device=worker_device, + dispatcher_queue=dispatcher.task_queue, + ) + + wms.append(worker_manager) + + wm_affinity: list[int] = [] + disp_affinity: list[int] = [] + + # This is hardcoded for a specific type of node: + # the GPU-to-CPU mapping is taken from the nvidia-smi tool + # TODO can this be computed on the fly? + gpu_to_cpu_aff: dict[int, list[int]] = {} + gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) + gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) + gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) + gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + + worker_manager_procs = [] + for worker_idx in range(args.num_workers): + wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 + wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] + disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) + worker_manager_procs.append(service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] + )) + + dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + + # TODO: use ProcessGroup and restart=True? 
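    # --- illustrative sketch (not part of the patch) -------------------------
    # The gpu_to_cpu_aff table above is hardcoded for one node type. When the
    # GPU-to-CPU topology is not known, a generic (and less optimal) fallback is
    # to split whatever CPUs are visible to this process evenly across the
    # workers and reserve a handful for the dispatcher. Linux-only
    # (os.sched_getaffinity); the helper name is hypothetical.
    import os

    def _even_cpu_partition(num_workers: int, dispatcher_cpus: int = 4):
        cpus = sorted(os.sched_getaffinity(0))
        disp, rest = cpus[:dispatcher_cpus], cpus[dispatcher_cpus:]
        per_worker = max(1, len(rest) // max(1, num_workers))
        workers = [rest[i * per_worker:(i + 1) * per_worker] for i in range(num_workers)]
        return disp, workers
    # --------------------------------------------------------------------------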
+ all_procs = [dispatcher_proc, *worker_manager_procs] + + print(f"Dispatcher proc: {dispatcher_proc}") + for proc in all_procs: + proc.start() + + while all(proc.is_alive for proc in all_procs): + time.sleep(1) diff --git a/setup.py b/setup.py index 512da78de9..709913eda8 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ class BuildError(Exception): "filelock>=3.4.2", "protobuf~=3.20", "jinja2>=3.1.2", - "watchdog>=4.0.0", + "watchdog>=4.0.0,<5", "pycapnp==2.0.0", "pydantic==1.10.14", "pyzmq>=25.1.2", diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index df9c2bbef6..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,23 +103,6 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None - headers = [ - "batch_size", - "w_deserialize", - "w_fetch_model", - "w_load_model", - "w_fetch_input", - "w_transform_input", - "w_execute", - "w_transform_output", - "w_assign_output", - "w_build_reply", - "w_serialize_resp", - "w_send", - ] - - print(",".join(headers)) - while running: self._on_iteration() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 6cf39be0fb..7526af14ad 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -605,10 +605,7 @@ def _start_steps(self) -> None: logger.debug(f"Step id {step_id} allocated on {hosts}") - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=hosts[0], - ) + global_policy = self.create_run_policy(request, hosts[0]) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 80fdd9cdc6..89b90f2e62 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -33,11 +33,7 @@ logger = get_logger(__name__) -try: - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 4636894bdd..130c5cf5eb 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -68,12 +68,12 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=0.001) as recvh: while not eot: try: message, _ = recvh.recv_bytes(timeout=None) messages.append(message) - except fli.FLIEOT as exc: + except fli.FLIEOT: eot = True return messages diff --git a/smartsim/_core/mli/infrastructure/control/__init__.py b/smartsim/_core/mli/infrastructure/control/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py new file mode 100644 index 0000000000..3570bd51ed --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -0,0 +1,146 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights 
reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from contextlib import _GeneratorContextManager, contextmanager + +from .....log import get_logger +from ...infrastructure.storage.featurestore import FeatureStore +from ..worker.worker import MachineLearningWorkerBase, RequestBatch + +logger = get_logger(__name__) + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._models: dict[str, t.Any] = {} + """Dict of keys to models which are loaded on this device""" + + @property + def name(self) -> str: + """The identifier of the device represented by this object""" + return self._name + + def add_model(self, key: str, model: t.Any) -> None: + """Add a reference to a model loaded on this device and assign it a key + + :param key: The key under which the model is saved + :param model: The model which is added + """ + self._models[key] = model + + def remove_model(self, key: str) -> None: + """Remove the reference to a model loaded on this device + + :param key: The key of the model to remove + """ + self._models.pop(key) + + def get_model(self, key: str) -> t.Any: + """Get the model corresponding to a given key + + :param key: the model key + :returns: the model for the given key + """ + return self._models[key] + + def __contains__(self, key: str) -> bool: + """Check if model with a given key is available on the device + + :param key: the key of the model to check for existence + :returns: whether the model is available on the device + """ + return key in self._models + + @contextmanager + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]: + yield self + if key_to_remove is not None: + self.remove_model(key_to_remove) + + +class DeviceManager: + def __init__(self, device: WorkerDevice): + """An object to manage devices such as GPUs and CPUs. 
+ + The main goal of the ``DeviceManager`` is to ensure that + the managed device is ready to be used by a worker to + run a given model + :param device: The managed device + """ + self._device = device + """Device managed by this object""" + + def _load_model_on_device( + self, + worker: MachineLearningWorkerBase, + batch: RequestBatch, + feature_stores: dict[str, FeatureStore], + ) -> None: + """Load the model needed to execute on a batch on the managed device. + + The model is loaded by the worker. + + :param worker: the worker that loads the model + :param batch: the batch for which the model is needed + :param feature_stores: feature stores where the model could be stored + """ + + model_bytes = worker.fetch_model(batch, feature_stores) + loaded_model = worker.load_model(batch, model_bytes, self._device.name) + self._device.add_model(batch.model_id.key, loaded_model.model) + + def get_device( + self, + worker: MachineLearningWorkerBase, + batch: RequestBatch, + feature_stores: dict[str, FeatureStore], + ) -> _GeneratorContextManager[WorkerDevice]: + """Get the device managed by this object + + the model needed to run the batch of requests is + guaranteed to be available on the model + + :param worker: The worker that wants to access the device + :param batch: The batch of requests + :param feature_store: The feature store on which part of the + data needed by the request may be stored + :return: A generator yielding the device + """ + model_in_request = batch.has_raw_model + + # Load model if not already loaded, or + # because it is sent with the request + if model_in_request or not batch.model_id.key in self._device: + self._load_model_on_device(worker, batch, feature_stores) + + key_to_remove = batch.model_id.key if model_in_request else None + return self._device.get(key_to_remove) diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py new file mode 100644 index 0000000000..e2c5bcd9e1 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -0,0 +1,70 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
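# --- illustrative sketch (not part of the patch) -----------------------------
# Intended call pattern for the DeviceManager defined above: before running a
# batch, a worker manager asks the DeviceManager for the managed device; the
# returned context manager guarantees the model is loaded on that device and
# evicts it afterwards only when it was shipped inline with the request.
# `run_batch_on` is a hypothetical stand-in for the worker's execute/transform
# steps, and `feature_stores` maps descriptors to attached FeatureStore objects.
def run_batch_on(model, device_name, batch):
    """Hypothetical placeholder for the worker's execution step."""
    ...


def process_batch(device_manager, worker, batch, feature_stores) -> None:
    with device_manager.get_device(worker, batch, feature_stores) as device:
        loaded_model = device.get_model(batch.model_id.key)
        run_batch_on(loaded_model, device.name, batch)
# ------------------------------------------------------------------------------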
+ +import typing as t + +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import ResponseBuilder + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +logger = get_logger(__file__) + + +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) + else: + logger.warning("Unable to notify client of error without reply_channel") diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py new file mode 100644 index 0000000000..d56912a8f0 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -0,0 +1,504 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
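# --- illustrative sketch (not part of the patch) -----------------------------
# How the error_handling helpers above are meant to be used: wrap a pipeline
# stage in try/except and route any failure back to the client's callback
# channel instead of letting it kill the service loop. `stage` and `request`
# are hypothetical placeholders; the import assumes the module path added in
# this PR.
from smartsim._core.mli.infrastructure.control.error_handling import exception_handler


def guarded_stage(stage, request) -> None:
    try:
        stage(request)
    except Exception as exc:
        exception_handler(exc, request.callback, f"Failure while running {stage.__name__}.")
# ------------------------------------------------------------------------------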
+ +# pylint: disable=import-error +# pylint: disable-next=unused-import +import dragon +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryPool +from dragon.mpbridge.queues import DragonQueue + +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp +import time +import typing as t +import uuid +from queue import Empty, Full, Queue + +from smartsim._core.entrypoints.service import Service + +from .....error import SmartSimError +from .....log import get_logger +from ....utils.timings import PerfTimer +from ...infrastructure.environmentloader import EnvironmentConfigLoader +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( + InferenceRequest, + MachineLearningWorkerBase, + ModelIdentifier, + RequestBatch, +) +from .error_handling import exception_handler + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +logger = get_logger("Request Dispatcher") + + +class BatchQueue(Queue[InferenceRequest]): + def __init__( + self, batch_timeout: float, batch_size: int, model_id: ModelIdentifier + ) -> None: + """Queue used to store inference requests waiting to be batched and + sent to Worker Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue. The time of the first item put is 0 seconds. + :param batch_size: Total capacity of the queue. + :param model_id: Key of the model which needs to be executed on the queued + requests + """ + super().__init__(maxsize=batch_size) + self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue. + The time of the first item put is 0 seconds.""" + self._batch_size = batch_size + """Total capacity of the queue.""" + self._first_put: t.Optional[float] = None + """Time at which the first item was put on the queue""" + self._disposable = False + """Whether the queue will not be used again and can be deleted. 
+ A disposable queue is always full.""" + self._model_id: ModelIdentifier = model_id + """Key of the model which needs to be executed on the queued requests""" + self._uid = str(uuid.uuid4()) + """Unique ID of queue""" + + @property + def uid(self) -> str: + """ID of this queue""" + return self._uid + + @property + def model_id(self) -> ModelIdentifier: + """Key of the model which needs to be run on the queued requests""" + return self._model_id + + def put( + self, + item: InferenceRequest, + block: bool = False, + timeout: t.Optional[float] = 0.0, + ) -> None: + """Put an inference request in the queue + :param item: The request + :param block: Whether to block when trying to put the item + :param timeout: Time (in seconds) to wait if block==True + :raises Full: If an item cannot be put on the queue + """ + super().put(item, block=block, timeout=timeout) + if self._first_put is None: + self._first_put = time.time() + + @property + def _elapsed_time(self) -> float: + """Time elapsed since the first item was put on this queue""" + if self.empty() or self._first_put is None: + return 0 + return time.time() - self._first_put + + @property + def ready(self) -> bool: + """True if the queue can be flushed""" + if self.empty(): + return False + + timed_out = ( + self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + ) + logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") + return self.full() or timed_out + + def make_disposable(self) -> None: + """Set this queue as disposable, and never use it again after it gets flushed""" + self._disposable = True + + @property + def can_be_removed(self) -> bool: + """Whether this queue can be deleted and garbage collected""" + return self.empty() and self._disposable + + def flush(self) -> list[t.Any]: + """Get all requests from queue + :return: Requests waiting to be executed + """ + num_items = self.qsize() + self._first_put = None + items = [] + for _ in range(num_items): + try: + items.append(self.get()) + except Empty: + break + + return items + + def full(self) -> bool: + """Return True if the queue has reached its maximum capacity""" + if self._disposable: + return True + return self.qsize() >= self._batch_size + + def empty(self) -> bool: + """Return True if the queue has 0 elements""" + return self.qsize() == 0 + + +class RequestDispatcher(Service): + def __init__( + self, + batch_timeout: float, + batch_size: int, + config_loader: EnvironmentConfigLoader, + worker_type: t.Type[MachineLearningWorkerBase], + mem_pool_size: int = 2 * 1024**3, + ) -> None: + """The RequestDispatcher intercepts inference requests, stages them in + queues and batches them together before making them available to Worker + Managers. + :param batch_timeout: Maximum elapsed time before flushing a complete or + incomplete batch + :param batch_size: Total capacity of each batch queue. 
+ :param mem_pool: Memory pool used to share batched input tensors with worker + managers + :param config_loader: Object to load configuration from environment + :param worker_type: Type of worker to instantiate to batch inputs + :param mem_pool_size: Size of the memory pool used to allocate tensors + :raises SmartSimError: If config_loaded.get_queue() does not return a channel + """ + super().__init__(as_service=True, cooldown=1) + self._queues: dict[str, list[BatchQueue]] = {} + """Dict of all batch queues available for a given model id""" + self._active_queues: dict[str, BatchQueue] = {} + """Mapping telling which queue is the recipient of requests for a given model + key""" + self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue""" + self._batch_size = batch_size + """Total capacity of each batch queue.""" + incoming_channel = config_loader.get_queue() + if incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + self._incoming_channel = incoming_channel + """The channel the dispatcher monitors for new tasks""" + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + """The queue on which batched inference requests are placed""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" + self._callback_factory = config_loader._callback_factory + """The type of communication channel to construct for callbacks""" + self._worker = worker_type() + """The worker used to batch inputs""" + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) + """Memory pool used to share batched input tensors with the Worker Managers""" + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + """Performance timer""" + + def _check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + + :param request: The request to validate + :returns: False if feature store validation fails for the request, True + otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") + return False + + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + + return True + + # pylint: disable-next=no-self-use + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + + :param request: The request to validate + :returns: False if model 
validation fails for the request, True otherwise + """ + if request.model_key or request.raw_model: + return True + + logger.error("Unable to continue without model bytes or feature store key") + return False + + # pylint: disable-next=no-self-use + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ + if request.input_keys or request.raw_inputs: + return True + + logger.error("Unable to continue without input bytes or feature store keys") + return False + + # pylint: disable-next=no-self-use + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ + if request.callback is not None: + return True + + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed + + :param request: The request to validate + :return: False if the request fails any validation checks, True otherwise""" + checks = [ + self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) + + def _on_iteration(self) -> None: + """This method is executed repeatedly until ``Service`` shutdown + conditions are satisfied and cooldown is elapsed. + """ + try: + self._perf_timer.set_active(True) + bytes_list: t.List[bytes] = self._incoming_channel.recv() + except Exception: + self._perf_timer.set_active(False) + else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() + + request = self._worker.deserialize_message( + request_bytes, self._callback_factory + ) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list + + self._perf_timer.measure_time("deserialize_message") + + if not self._validate_request(request): + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) + self._perf_timer.measure_time("validate_request") + else: + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + self.remove_queues() + + self._perf_timer.end_timings() + + if self._perf_timer.max_length == 801 and self._perf_timer.is_active: + self._perf_timer.print_timings(True) + + def remove_queues(self) -> None: + """Remove references to queues that can be removed + and allow them to be garbage collected""" + queue_lists_to_remove = [] + for key, queues in self._queues.items(): + queues_to_remove = [] + for queue in queues: + if queue.can_be_removed: + queues_to_remove.append(queue) + + for queue_to_remove in queues_to_remove: + queues.remove(queue_to_remove) + if ( + key in self._active_queues + and self._active_queues[key] == queue_to_remove + ): + del self._active_queues[key] + + if len(queues) == 0: + queue_lists_to_remove.append(key) + + for key in queue_lists_to_remove: + del self._queues[key] + + @property + def task_queue(self) -> DragonQueue: + """The queue on which batched requests 
are placed""" + return self._outgoing_queue + + def _swap_queue(self, model_id: ModelIdentifier) -> None: + """Get an empty queue or create a new one + + and make it the active one for a given model. + :param model_id: The id of the model for which the + queue has to be swapped + """ + if model_id.key in self._queues: + for queue in self._queues[model_id.key]: + if not queue.full(): + self._active_queues[model_id.key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_id) + if model_id.key in self._queues: + self._queues[model_id.key].append(new_queue) + else: + self._queues[model_id.key] = [new_queue] + self._active_queues[model_id.key] = new_queue + return + + def dispatch(self, request: InferenceRequest) -> None: + """Assign a request to a batch queue + :param request: the request to place + """ + if request.raw_model is not None: + logger.debug("Direct inference requested, creating tmp queue") + tmp_id = f"_tmp_{str(uuid.uuid4())}" + tmp_queue: BatchQueue = BatchQueue( + batch_timeout=0, + batch_size=1, + model_id=ModelIdentifier(key=tmp_id, descriptor="TMP"), + ) + self._active_queues[tmp_id] = tmp_queue + self._queues[tmp_id] = [tmp_queue] + tmp_queue.put_nowait(request) + tmp_queue.make_disposable() + return + + if request.model_key: + success = False + while not success: + try: + self._active_queues[request.model_key.key].put_nowait(request) + success = True + except (Full, KeyError): + self._swap_queue(request.model_key) + + def flush_requests(self) -> None: + """Get all requests from queues which are ready to be flushed. Place all + avaliable request batches in the outgoing queue. + """ + for queue_list in self._queues.values(): + for queue in queue_list: + if queue.ready: + self._perf_timer.measure_time("find_queue") + try: + batch = RequestBatch( + requests=queue.flush(), + inputs=None, + model_id=queue.model_id, + ) + finally: + self._perf_timer.measure_time("flush_requests") + try: + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_stores=self._feature_stores + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error fetching input.", + ) + continue + self._perf_timer.measure_time("fetch_input") + try: + transformed_inputs = self._worker.transform_input( + batch=batch, + fetch_results=fetch_results, + mem_pool=self._mem_pool, + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error Transforming input.", + ) + continue + + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + + try: + self._outgoing_queue.put(batch) + except Exception as exc: + exception_handler( + exc, + None, + "Error placing batch on task queue.", + ) + continue + self._perf_timer.measure_time("put") + + def _can_shutdown(self) -> bool: + """Whether the Service can be shut down""" + return False + + def __del__(self) -> None: + self._mem_pool.destroy() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index dcc35ae831..54a245b813 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,67 +24,42 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# pylint: disable=import-error +# pylint: disable-next=unused-import +import dragon + +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp import time import typing as t +from queue import Empty from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from .....log import get_logger from ....entrypoints.service import Service -from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonchannel import DragonCommChannel +from ....utils.timings import PerfTimer from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.worker.worker import ( InferenceReply, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import ResponseBuilder +from .devicemanager import DeviceManager, WorkerDevice +from .error_handling import build_failure_reply, exception_handler if t.TYPE_CHECKING: - from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__name__) -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: - """Build a response indicating a failure occurred - :param status: The status of the response - :param message: The error message to include in the response""" - return MessageHandler.build_response( - status=status, - message=message, - result=None, - custom_attributes=None, - ) - - -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. - - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) - - class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -92,26 +67,29 @@ class WorkerManager(Service): def __init__( self, config_loader: EnvironmentConfigLoader, - worker: MachineLearningWorkerBase, + worker_type: t.Type[MachineLearningWorkerBase], + dispatcher_queue: "mp.Queue[RequestBatch]", as_service: bool = False, cooldown: int = 0, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager - :param config_loader: Environment config loader that loads the task queue and - feature store - :param workers: A worker to manage + :param config_loader: Environment config loader for loading queues + and feature stores + :param worker_type: The type of worker to manage + :param dispatcher_queue: Queue from which the batched requests are pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param device: The type of hardware the workers must be executed on + :param device: The device on which the Worker should run. Every worker manager + is assigned one single GPU (if available), thus the device should have no index. 
""" super().__init__(as_service, cooldown) - self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() - """the queue the manager monitors for new tasks""" - self._worker = worker + self._dispatcher_queue = dispatcher_queue + """The Dispatcher queue that the WorkerManager monitors for new batches""" + self._worker = worker_type() """The ML Worker implementation""" self._callback_factory = config_loader._callback_factory """The type of communication channel to construct for callbacks""" @@ -126,19 +104,28 @@ def __init__( self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() """A standalone, system-created feature store used to share internal information among MLI components""" + self._device_manager: t.Optional[DeviceManager] = None + """Object responsible for model caching and device access""" + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + """Performance timer""" - def _check_feature_stores(self, request: InferenceRequest) -> bool: + def _on_start(self) -> None: + """Called on initial entry into Service `execute` event loop before + `_on_iteration` is invoked.""" + self._device_manager = DeviceManager(WorkerDevice(self._device)) + + def _check_feature_stores(self, batch: RequestBatch) -> bool: """Ensures that all feature stores required by the request are available - :param request: The request to validate - :returns: False if feature store validation fails for the request, True otherwise + :param batch: The batch of requests to validate + :returns: False if feature store validation fails for the batch, True otherwise """ # collect all feature stores required by the request fs_model: t.Set[str] = set() - if request.model_key: - fs_model = {request.model_key.descriptor} - fs_inputs = {key.descriptor for key in request.input_keys} - fs_outputs = {key.descriptor for key in request.output_keys} + if batch.model_id.key: + fs_model = {batch.model_id.descriptor} + fs_inputs = {key.descriptor for key in batch.input_keys} + fs_outputs = {key.descriptor for key in batch.output_keys} # identify which feature stores are requested and unknown fs_desired = fs_model.union(fs_inputs).union(fs_outputs) @@ -158,269 +145,169 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: return True - def _check_model(self, request: InferenceRequest) -> bool: - """Ensure that a model is available for the request - - :param request: The request to validate - :returns: False if model validation fails for the request, True otherwise - """ - if request.model_key or request.raw_model: - return True - - logger.error("Unable to continue without model bytes or feature store key") - return False - - def _check_inputs(self, request: InferenceRequest) -> bool: - """Ensure that inputs are available for the request - - :param request: The request to validate - :returns: False if input validation fails for the request, True otherwise - """ - if request.input_keys or request.raw_inputs: - return True - - logger.error("Unable to continue without input bytes or feature store keys") - return False - - def _check_callback(self, request: InferenceRequest) -> bool: - """Ensure that a callback channel is available for the request - - :param request: The request to validate - :returns: False if callback validation fails for the request, True otherwise - """ - if request.callback is not None: - return True - - logger.error("No callback channel provided in request") - return False - - def _validate_request(self, request: InferenceRequest) -> bool: + def 
_validate_batch(self, batch: RequestBatch) -> bool: """Ensure the request can be processed - :param request: The request to validate + :param batch: The batch of requests to validate :return: False if the request fails any validation checks, True otherwise""" - checks = [ - self._check_feature_stores(request), - self._check_model(request), - self._check_inputs(request), - self._check_callback(request), - ] - return all(checks) + if batch is None or len(batch.requests) == 0: + return False + + return self._check_feature_stores(batch) + # remove this when we are done with time measurements + # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - logger.debug("executing worker manager pipeline") - if self._task_queue is None: - logger.error("No queue to check for tasks") + pre_batch_time = time.perf_counter() + try: + batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) + except Empty: return - timings = [] # timing - - bytes_list: t.List[bytes] = self._task_queue.recv() + self._perf_timer.start_timings( + "flush_requests", time.perf_counter() - pre_batch_time + ) - if not bytes_list: + if not self._validate_batch(batch): exception_handler( - ValueError("No request data found"), + ValueError("An invalid batch was received"), None, - "No request data found.", + "Error batching inputs, the batch was invalid.", ) return - request_bytes = bytes_list[0] - tensor_bytes_list = bytes_list[1:] - - interm = time.perf_counter() # timing - request = self._worker.deserialize_message( - request_bytes, self._callback_factory - ) - - if request.input_meta and tensor_bytes_list: - request.raw_inputs = tensor_bytes_list + if self._device_manager is None: + for request in batch.requests: + msg = "No Device Manager found. WorkerManager._on_start() " + "must be called after initialization. If possible, " + "you should use `WorkerManager.execute()` instead of " + "directly calling `_on_iteration()`." + try: + self._dispatcher_queue.put(batch) + except Exception: + msg += "\nThe batch could not be put back in the queue " + "and will not be processed." 
+ exception_handler( + RuntimeError(msg), + request.callback, + "Error acquiring device manager", + ) + return - if not self._validate_request(request): - exception_handler( - ValueError("Error validating the request"), - request.callback, - "Error validating the request.", + try: + device_cm = self._device_manager.get_device( + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, ) - - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - - reply = InferenceReply() - - if not request.raw_model: - if request.model_key is None: + except Exception as exc: + for request in batch.requests: exception_handler( - ValueError("Could not find model key or model"), + exc, request.callback, - "Could not find model key or model.", + "Error loading model on device or getting device.", ) - return + return + self._perf_timer.measure_time("fetch_model") - if request.model_key.key in self._cached_models: - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - model_result = LoadModelResult( - self._cached_models[request.model_key.key] - ) + with device_cm as device: - else: - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - fetch_model_result = self._worker.fetch_model( - request, self._feature_stores - ) - except Exception as e: + try: + model_result = LoadModelResult(device.get_model(batch.model_id.key)) + except Exception as exc: + for request in batch.requests: exception_handler( - e, request.callback, "Failed while fetching the model." + exc, request.callback, "Error getting model from device." ) - return + return + self._perf_timer.measure_time("load_model") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - model_result = self._worker.load_model( - request, - fetch_result=fetch_model_result, - device=self._device, - ) - self._cached_models[request.model_key.key] = model_result.model - except Exception as e: + if batch.inputs is None: + for request in batch.requests: exception_handler( - e, + ValueError("Error batching inputs"), request.callback, - "Failed while loading model from feature store.", + "Error batching inputs.", ) - return + return + transformed_input = batch.inputs - else: - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing try: - fetch_model_result = self._worker.fetch_model( - request, self._feature_stores + execute_result = self._worker.execute( + batch, model_result, transformed_input, device.name ) except Exception as e: - exception_handler( - e, request.callback, "Failed while fetching the model." - ) + for request in batch.requests: + exception_handler(e, request.callback, "Failed while executing.") return + self._perf_timer.measure_time("execute") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing try: - model_result = self._worker.load_model( - request, fetch_result=fetch_model_result, device=self._device + transformed_outputs = self._worker.transform_output( + batch, execute_result ) except Exception as e: - exception_handler( - e, - request.callback, - "Failed while loading model from feature store.", - ) + for request in batch.requests: + exception_handler( + e, request.callback, "Failed while transforming the output." 
+ ) return - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - fetch_input_result = self._worker.fetch_inputs( - request, self._feature_stores - ) - except Exception as e: - exception_handler(e, request.callback, "Failed while fetching the inputs.") - return - - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - transformed_input = self._worker.transform_input( - request, fetch_input_result, self._device - ) - except Exception as e: - exception_handler( - e, request.callback, "Failed while transforming the input." - ) - return - - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - execute_result = self._worker.execute( - request, model_result, transformed_input - ) - except Exception as e: - exception_handler(e, request.callback, "Failed while executing.") - return - - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - try: - transformed_output = self._worker.transform_output( - request, execute_result, self._device - ) - except Exception as e: - exception_handler( - e, request.callback, "Failed while transforming the output." - ) - return + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: + try: + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_stores, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." + ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - if request.output_keys: - try: - reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_stores - ) - except Exception as e: - exception_handler( - e, request.callback, "Failed while placing the output." 
- ) - return - else: - reply.outputs = transformed_output.outputs - - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - - results = self._worker.prepare_outputs(reply) - response = MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) + self._perf_timer.measure_time("build_reply") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing + serialized_resp = MessageHandler.serialize_response(response) - serialized_resp = MessageHandler.serialize_response(response) + self._perf_timer.measure_time("serialize_resp") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - if request.callback: - # send serialized response - request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing + self._perf_timer.end_timings() - print(" ".join(str(time) for time in timings)) # timing + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index eea349894c..0639d59696 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -29,6 +29,9 @@ import numpy as np import torch +# pylint: disable=import-error +from dragon.managed_memory import MemoryAlloc, MemoryPool + from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp @@ -36,13 +39,18 @@ ExecuteResult, FetchInputResult, FetchModelResult, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, TransformInputResult, TransformOutputResult, ) +# pylint: enable=import-error + + +torch.set_num_threads(1) +torch.set_num_interop_threads(4) logger = get_logger(__name__) @@ -51,75 +59,150 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes - elif request.raw_model and request.raw_model.data: - model_bytes = request.raw_model.data + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data else: raise ValueError("Unable to load model without reference object") device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] + for old, new in device_to_torch.items(): + device = device.replace(old, new) + buffer = io.BytesIO(initial_bytes=model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore + with torch.no_grad(): + model = torch.jit.load(buffer, 
map_location=device) # type: ignore + model.eval() result = LoadModelResult(model) return result @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: - result = [] + results: list[torch.Tensor] = [] + total_samples = 0 + slices: list[slice] = [] - device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] - if fetch_result.meta is None: + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - result.append( - torch.from_numpy(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta + ): + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results) - 1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] ) - return TransformInputResult(result) - # return data # note: this fails copy test! 
+ results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims, all_dtypes) + + # pylint: disable-next=unused-argument @staticmethod def execute( - request: InferenceRequest, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: if not load_result.model: raise SmartSimError("Model must be loaded to execute") + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + for old, new in device_to_torch.items(): + device = device.replace(old, new) + + tensors = [] + mem_allocs = [] + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) model: torch.nn.Module = load_result.model - model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] + with torch.no_grad(): + model.eval() + results = [ + model( + *[ + tensor.to(device, non_blocking=True).detach() + for tensor in tensors + ] + ) + ] + + transform_result.transformed = [] - execute_result = ExecuteResult(results) + execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() return execute_result @staticmethod def transform_output( - request: InferenceRequest, + batch: RequestBatch, execute_result: ExecuteResult, - result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [ - item.to("cpu").numpy().tobytes() for item in execute_result.predictions - ] - - # todo: need the shape from latest schemas added here. - return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - [item.numpy().tobytes() for item in execute_result.predictions], - None, - "c", - "float32", - ) # fixme + ) -> list[TransformOutputResult]: + transformed_list: list[TransformOutputResult] = [] + cpu_predictions = [ + prediction.cpu() for prediction in execute_result.predictions + ] + for result_slice in execute_result.slices: + transformed = [] + for cpu_item in cpu_predictions: + transformed.append(cpu_item[result_slice].numpy().tobytes()) + + # todo: need the shape from latest schemas added here. + transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme + + execute_result.predictions = [] + + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 89fb635247..25e4dc49f7 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,8 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
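The batching scheme in TorchWorker.transform_input above concatenates each request's tensor bytes along the sample dimension and records one `slice` per request; `execute` runs the model on the combined array, and `transform_output` uses the recorded slices to return only the matching rows to each request. The sketch below reproduces that bookkeeping with plain NumPy (no Dragon memory pool) purely as an illustration; the variable names are hypothetical.

import numpy as np

# Two requests, each carrying one float32 tensor whose first dimension is the
# number of samples (2 and 4 samples respectively, 3 features each).
request_tensors = [
    np.ones((2, 3), dtype="float32"),
    2 * np.ones((4, 3), dtype="float32"),
]

slices: list[slice] = []
total_samples = 0
for tensor in request_tensors:
    num_samples = tensor.shape[0]
    slices.append(slice(total_samples, total_samples + num_samples))
    total_samples += num_samples

# Concatenate the raw bytes and rebuild a single batched array, analogous to
# what transform_input does on a MemoryPool allocation.
packed = b"".join(tensor.tobytes() for tensor in request_tensors)
batched = np.frombuffer(packed, dtype="float32").reshape(total_samples, 3)

# Stand-in for model(batched); the recorded slices then split the result back
# per request, mirroring transform_output.
predictions = batched * 10
per_request = [predictions[result_slice] for result_slice in slices]
assert per_request[0].shape == (2, 3)
assert per_request[1].shape == (4, 3)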
+# pylint: disable=import-error +from dragon.managed_memory import MemoryPool + +# isort: off +# isort: on + import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass from .....error import SmartSimError from .....log import get_logger @@ -40,6 +47,9 @@ logger = get_logger(__name__) +# Placeholder +ModelIdentifier = FeatureStoreKey + class InferenceRequest: """Internal representation of an inference request from a client""" @@ -100,19 +110,34 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: - """A wrapper around a transformed input""" + """A wrapper around a transformed batch of input tensors""" - def __init__(self, result: t.Any) -> None: + def __init__( + self, + result: t.Any, + slices: list[slice], + dims: list[list[int]], + dtypes: list[str], + ) -> None: """Initialize the object""" self.transformed = result + """List of Dragon MemoryAlloc objects on which the tensors are stored""" + self.slices = slices + """Each slice represents which portion of the input tensors belongs to + which request""" + self.dims = dims + """Dimension of the transformed tensors""" + self.dtypes = dtypes + """Data type of transformed tensors""" class ExecuteResult: """A wrapper around inference results""" - def __init__(self, result: t.Any) -> None: + def __init__(self, result: t.Any, slices: list[slice]) -> None: """Initialize the object""" self.predictions = result + self.slices = slices class FetchInputResult: @@ -153,6 +178,62 @@ def __init__(self, result: bytes) -> None: self.model_bytes: bytes = result +@dataclass +class RequestBatch: + """A batch of aggregated inference requests""" + + requests: list[InferenceRequest] + inputs: t.Optional[TransformInputResult] + model_id: ModelIdentifier + + @property + def has_valid_requests(self) -> bool: + """Returns whether the batch contains at least one request. + + :return: True if at least one request is available + """ + return len(self.requests) > 0 + + @property + def has_raw_model(self) -> bool: + """Returns whether the batch has a raw model + + :return: True if the batch has a raw model + """ + return self.raw_model is not None + + @property + def raw_model(self) -> t.Optional[t.Any]: + """Returns the raw model to use to execute for this batch + if it is available. 
+ :return: A model if available, otherwise None""" + if self.has_valid_requests: + return self.requests[0].raw_model + return None + + @property + def input_keys(self) -> t.List[FeatureStoreKey]: + """All input keys available in this batch's requests + + :return: All input keys belonging to requests in this batch""" + keys = [] + for request in self.requests: + keys.extend(request.input_keys) + + return keys + + @property + def output_keys(self) -> t.List[FeatureStoreKey]: + """All output keys available in this batch's requests + + :return: All output keys belonging to requests in this batch""" + keys = [] + for request in self.requests: + keys.extend(request.output_keys) + + return keys + + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @@ -233,29 +314,30 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: @staticmethod def fetch_model( - request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: Raw bytes of the model""" + :return: Raw bytes of the model + :raises SmartSimError: if neither a key or a model are provided or the + model cannot be retrieved from the feature store + :raises ValueError: if a feature store is not available and a raw + model is not provided""" - if request.raw_model: - # Should we cache model in the feature store? - # model_key = hash(request.raw_model) - # feature_store[model_key] = request.raw_model - # short-circuit and return the directly supplied model - return FetchModelResult(request.raw_model.data) + # All requests in the same batch share the model + if batch.raw_model: + return FetchModelResult(batch.raw_model.data) if not feature_stores: raise ValueError("Feature store is required for model retrieval") - if not request.model_key: + if batch.model_id is None: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) - key, fsd = request.model_key.key, request.model_key.descriptor + key, fsd = batch.model_id.key, batch.model_id.descriptor try: feature_store = feature_stores[fsd] @@ -267,51 +349,47 @@ def fetch_model( @staticmethod def fetch_inputs( - request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] - ) -> FetchInputResult: + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: the fetched input""" + :return: the fetched input + :raises ValueError: If neither an input key or an input tensor are provided + :raises SmartSimError: If a tensor for a given key cannot be retrieved""" + fetch_results = [] + for request in batch.requests: + if request.raw_inputs: + fetch_results.append( + FetchInputResult(request.raw_inputs, request.input_meta) + ) + continue - if request.raw_inputs: - return FetchInputResult(request.raw_inputs, request.input_meta) + if not feature_stores: + raise ValueError("No input and no feature store provided") - if not 
feature_stores: - raise ValueError("No input and no feature store provided") - - if request.input_keys: - data: t.List[bytes] = [] - - for fs_key in request.input_keys: - try: - feature_store = feature_stores[fs_key.descriptor] - tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) - data.append(tensor_bytes) - except KeyError as ex: - logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {fs_key.key}" - ) from ex - return FetchInputResult( - data, meta=None - ) # fixme: need to get both tensor and descriptor - - raise ValueError("No input source") + if request.input_keys: + data: t.List[bytes] = [] - @staticmethod - def batch_requests( - request: InferenceRequest, transform_result: TransformInputResult - ) -> CreateInputBatchResult: - """Create a batch of requests. Return the batch when batch_size datum have been - collected or a configured batch duration has elapsed. - :param request: The request that triggered the pipeline - :param transform_result: Transformed inputs ready for batching - :return: `None` if batch size has not been reached and timeout not exceeded.""" - if transform_result is not None or request.batch_size: - raise NotImplementedError("Batching is not yet supported") - return CreateInputBatchResult(None) + for fs_key in request.input_keys: + try: + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise SmartSimError( + f"Tensor could not be retrieved with key {fs_key.key}" + ) from ex + fetch_results.append( + FetchInputResult(data, meta=None) + ) # fixme: need to get both tensor and descriptor + continue + + raise ValueError("No input source") + + return fetch_results @staticmethod def place_output( @@ -324,7 +402,9 @@ def place_output( :param request: The request that triggered the pipeline :param execute_result: Results from inference :param feature_stores: Available feature stores used for persistence - :return: A collection of keys that were placed in the feature store""" + :return: A collection of keys that were placed in the feature store + :raises ValueError: If a feature store is not provided + """ if not feature_stores: raise ValueError("Feature store is required for output persistence") @@ -342,13 +422,13 @@ def place_output( class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): - """Abstrct base class providing contract for a machine learning + """Abstract base class providing contract for a machine learning worker implementation.""" @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory @@ -359,35 +439,39 @@ def load_model( @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: - """Given a collection of data, perform a transformation on the data + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. 
:param request: The request that triggered the pipeline - :param fetch_result: Raw output from fetching inputs out of a feature store - :param device: The device on which the transformed input must be placed + :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @abstractmethod def execute( - request: InferenceRequest, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: """Execute an ML model on inputs transformed for use by the model - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed :return: The result of inference wrapped in an ExecuteResult""" @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, execute_result: ExecuteResult, result_device: str - ) -> TransformOutputResult: + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :param result_device: The device on which the result of inference is placed - :return:""" + :return: A list of transformed outputs""" diff --git a/smartsim/_core/mli/mli_schemas/model/__init__.py b/smartsim/_core/mli/mli_schemas/model/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py new file mode 100644 index 0000000000..a61a243220 --- /dev/null +++ b/smartsim/_core/utils/timings.py @@ -0,0 +1,143 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
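RequestBatch, introduced above, groups several InferenceRequest objects that target the same model: `input_keys` and `output_keys` concatenate the keys of every request, and `raw_model` is taken from the first request when one is present. A minimal usage sketch follows; it assumes SmartSim with this patch is installed in a Dragon-enabled environment, since the worker module imports Dragon at import time, and the key and descriptor strings are placeholders.

# Assumes a Dragon-enabled SmartSim installation containing this patch.
from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey
from smartsim._core.mli.infrastructure.worker.worker import (
    InferenceRequest,
    RequestBatch,
)

model_id = FeatureStoreKey(key="my-model", descriptor="feature-store-0")
requests = [
    InferenceRequest(
        model_key=model_id,
        input_keys=[FeatureStoreKey(key=f"in-{i}", descriptor="feature-store-0")],
        output_keys=[FeatureStoreKey(key=f"out-{i}", descriptor="feature-store-0")],
    )
    for i in range(2)
]

batch = RequestBatch(requests=requests, inputs=None, model_id=model_id)
assert batch.has_valid_requests
assert not batch.has_raw_model     # no request carried raw model bytes
assert len(batch.input_keys) == 2  # keys aggregated across all requests
assert len(batch.output_keys) == 2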
+ +import time +import typing as t +from collections import OrderedDict + +import numpy as np + +from ...log import get_logger + +logger = get_logger("PerfTimer") + + +class PerfTimer: + def __init__( + self, + filename: str = "timings", + prefix: str = "", + timing_on: bool = True, + debug: bool = False, + ): + self._start: t.Optional[float] = None + self._interm: t.Optional[float] = None + self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() + self._timing_on = timing_on + self._filename = filename + self._prefix = prefix + self._debug = debug + + def _add_label_to_timings(self, label: str) -> None: + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: t.Union[float, int]) -> str: + return f"{number:0.4e}" + + def start_timings( + self, + first_label: t.Optional[str] = None, + first_value: t.Optional[t.Union[float, int]] = None, + ) -> None: + if self._timing_on: + if first_label is not None and first_value is not None: + mod_label = self._make_label(first_label) + value = self._format_number(first_value) + self._log(f"Started timing: {first_label}: {value}") + self._add_label_to_timings(mod_label) + self._timings[mod_label].append(value) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self) -> None: + if self._timing_on and self._start is not None: + mod_label = self._make_label("total_time") + self._add_label_to_timings(mod_label) + delta = self._format_number(time.perf_counter() - self._start) + self._timings[self._make_label("total_time")].append(delta) + self._log(f"Finished timing: {mod_label}: {delta}") + self._interm = None + + def _make_label(self, label: str) -> str: + return self._prefix + label + + def _get_delta(self) -> t.Union[float, int]: + if self._interm is None: + return 0 + return time.perf_counter() - self._interm + + def get_last(self, label: str) -> str: + mod_label = self._make_label(label) + if mod_label in self._timings: + value = self._timings[mod_label][-1] + if value: + return f"{label}: {value}" + + return "Not measured yet" + + def measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + mod_label = self._make_label(label) + self._add_label_to_timings(mod_label) + delta = self._format_number(self._get_delta()) + self._timings[mod_label].append(delta) + self._log(f"{mod_label}: {delta}") + self._interm = time.perf_counter() + + def _log(self, msg: str) -> None: + if self._debug: + logger.info(msg) + + @property + def max_length(self) -> int: + if len(self._timings) == 0: + return 0 + return max(len(value) for value in self._timings.values()) + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + try: + value_array = np.array(list(self._timings.values()), dtype=float) + except Exception as e: + logger.exception(e) + return + value_array = np.transpose(value_array) + if self._debug: + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save(self._prefix + self._filename + ".npy", value_array) + + def set_active(self, active: bool = True) -> None: + """Set whether the timer will record time""" + self._timing_on = active + + @property + def is_active(self) -> bool: + """Returns true if the timer will record time""" + return self._timing_on diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py similarity index 80% rename 
from tests/mli/test_core_machine_learning_worker.py rename to tests/dragon/test_core_machine_learning_worker.py index 7ef4ab259b..231a971241 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -28,6 +28,9 @@ import time import pytest + +dragon = pytest.importorskip("dragon") + import torch import smartsim.error as sse @@ -35,6 +38,7 @@ from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -42,8 +46,8 @@ from .featurestore import FileSystemFeatureStore, MemoryFeatureStore -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_b +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # retrieved from pytest fixtures is_dragon = ( @@ -94,9 +98,11 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -110,10 +116,12 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -133,10 +141,11 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -150,13 +159,13 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) # todo: consider that raising this exception shows impl. replace... 
with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -173,11 +182,11 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -193,12 +202,16 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: request = InferenceRequest( input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] ) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + worker = MachineLearningWorkerCore feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_fetch_input_disk_missing() -> None: @@ -212,8 +225,11 @@ def test_fetch_input_disk_missing() -> None: request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key[0] in ex.value.args[0] @@ -236,9 +252,14 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs - assert list(fetch_result.inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs + assert ( + list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + ) @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -269,9 +290,12 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> ] ) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) - raw_bytes = list(fetch_result.inputs) + raw_bytes = list(fetch_result[0].inputs) assert raw_bytes assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10] assert raw_bytes[1][:10] == body2[:10] @@ -288,8 +312,11 @@ def test_fetch_input_feature_store_missing() -> None: fsd = 
feature_store.descriptor request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -307,21 +334,11 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: feature_store[key] = persist_torch_tensor.read_bytes() request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None - - -def test_batch_requests() -> None: - """Verify batch requests handles an empty data set gracefully""" - worker = MachineLearningWorkerCore - result = TransformInputResult([]) - - request = InferenceRequest(batch_size=10) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) - with pytest.raises(NotImplementedError): - # NOTE: we expect this to fail since it's not yet implemented. - # TODO: once implemented, replace this expectation of failure... - worker.batch_requests(request, result) + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_place_outputs() -> None: diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py new file mode 100644 index 0000000000..8edeb60fbb --- /dev/null +++ b/tests/dragon/test_device_manager.py @@ -0,0 +1,185 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
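The PerfTimer added in smartsim/_core/utils/timings.py above is what the request dispatcher (prefix "r_") and worker manager (prefix "w_") use for their per-stage timings. A minimal usage sketch, assuming SmartSim with this patch is installed:

import time

from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="w_", timing_on=True, debug=False)

timer.start_timings()             # may also be seeded with a first label/value pair
time.sleep(0.01)
timer.measure_time("load_model")  # records the delta since the last mark as "w_load_model"
time.sleep(0.01)
timer.measure_time("execute")
timer.end_timings()               # records the overall delta as "w_total_time"

print(timer.get_last("execute"))    # most recent delta, e.g. "execute: 1.0213e-02"
timer.print_timings(to_file=False)  # prints the recorded labels; to_file=True saves a .npy dump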
+ +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.control.devicemanager import ( + DeviceManager, + WorkerDevice, +) +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +class MockWorker(MachineLearningWorkerBase): + @staticmethod + def fetch_model( + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> FetchModelResult: + if batch.has_raw_model: + return FetchModelResult(batch.raw_model) + return FetchModelResult(b"fetched_model") + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + return LoadModelResult(fetch_result.model_bytes) + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: "MemoryPool", + ) -> TransformInputResult: + return TransformInputResult(b"result", [slice(0, 1)], [[1, 2]], ["float32"]) + + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + return ExecuteResult(b"result", [slice(0, 1)]) + + @staticmethod + def transform_output( + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: + return [TransformOutputResult(b"result", None, "c", "float32")] + + +def test_worker_device(): + worker_device = WorkerDevice("gpu:0") + assert worker_device.name == "gpu:0" + + model_key = "my_model_key" + model = b"the model" + + worker_device.add_model(model_key, model) + + assert model_key in worker_device + assert worker_device.get_model(model_key) == model + worker_device.remove_model(model_key) + + assert model_key not in worker_device + + +def test_device_manager_model_in_request(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey(key="model key", descriptor="desc") + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"raw model", + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_key, + ) + + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"raw model" + + assert model_key.key not in worker_device + + +def test_device_manager_model_key(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey(key="model key", descriptor="desc") + + request = InferenceRequest( + model_key=model_key, + callback=None, + 
raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=None, + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_key, + ) + + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"fetched_model" + + assert model_key.key in worker_device diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 5603269b2f..b20424866a 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -30,12 +30,19 @@ dragon = pytest.importorskip("dragon") +import multiprocessing as mp + import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface +from dragon.mpbridge.queues import DragonQueue from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.devicemanager import WorkerDevice +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, @@ -44,13 +51,18 @@ from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, FetchModelResult, InferenceReply, + InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -85,7 +97,7 @@ def setup_worker_manager_model_bytes( backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) @@ -95,17 +107,136 @@ def setup_worker_manager_model_bytes( # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, as_service=False, cooldown=3, ) + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest( + model_key=None, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + + model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request_batch = RequestBatch( 
+ [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_id, + ) + + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type + + +@pytest.fixture +def setup_worker_manager_model_key( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + + worker_manager = WorkerManager( + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, + as_service=False, + cooldown=3, + ) + + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest( + model_key=model_id, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_id, + ) + + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type + + +@pytest.fixture +def setup_request_dispatcher_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") @@ -113,19 +244,20 @@ def setup_worker_manager_model_bytes( test_dir, model, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) - return worker_manager, integrated_worker + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type @pytest.fixture -def setup_worker_manager_model_key( - test_dir: str, +def 
setup_request_dispatcher_model_key( + test_dir, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) @@ -135,29 +267,33 @@ def setup_worker_manager_model_key( # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) - worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, - as_service=False, - cooldown=3, + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, ) + request_dispatcher._on_start() tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model_key = MessageHandler.build_model_key( - "model key", app_feature_store.descriptor + key="model key", feature_store_descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) - return worker_manager, integrated_worker + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): @@ -167,7 +303,7 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) @@ -193,21 +329,15 @@ def mock_exception_handler(exc, reply_channel, failure_message): "stage, error_message", [ pytest.param( - "fetch_model", "Failed while fetching the model.", id="fetch model" + "fetch_model", + "Error loading model on device or getting device.", + id="fetch model", ), pytest.param( "load_model", - "Failed while loading model from feature store.", + "Error loading model on device or getting device.", id="load model", ), - pytest.param( - "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" - ), - pytest.param( - "transform_input", - "Failed while transforming the input.", - id="transform inputs", - ), pytest.param("execute", "Failed while executing.", id="execute"), pytest.param( "transform_output", @@ -219,7 +349,7 @@ def mock_exception_handler(exc, reply_channel, failure_message): ), ], ) -def test_pipeline_stage_errors_handled( +def test_wm_pipeline_stage_errors_handled( request, setup_worker_manager, monkeypatch: pytest.MonkeyPatch, @@ -227,7 +357,13 @@ def test_pipeline_stage_errors_handled( error_message: str, ): """Ensures that the worker manager does not crash after a failure in various pipeline stages""" - worker_manager, integrated_worker = request.getfixturevalue(setup_worker_manager) 
+ worker_manager, integrated_worker_type = request.getfixturevalue( + setup_worker_manager + ) + integrated_worker = worker_manager._worker + + worker_manager._on_start() + device = worker_manager._device_manager._device mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) if stage not in ["fetch_model"]: @@ -236,42 +372,28 @@ def test_pipeline_stage_errors_handled( "fetch_model", MagicMock(return_value=FetchModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model"]: monkeypatch.setattr( integrated_worker, "load_model", MagicMock(return_value=LoadModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model", "fetch_inputs"]: monkeypatch.setattr( - integrated_worker, - "fetch_inputs", - MagicMock(return_value=FetchInputResult([b"result_bytes"], None)), - ) - if stage not in ["fetch_model", "load_model", "fetch_inputs", "transform_input"]: - monkeypatch.setattr( - integrated_worker, - "transform_input", - MagicMock(return_value=TransformInputResult(b"result_bytes")), + device, + "get_model", + MagicMock(return_value=b"result_bytes"), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", ]: monkeypatch.setattr( integrated_worker, "execute", - MagicMock(return_value=ExecuteResult(b"result_bytes")), + MagicMock(return_value=ExecuteResult(b"result_bytes", [slice(0, 1)])), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", "transform_output", ]: @@ -279,7 +401,7 @@ def test_pipeline_stage_errors_handled( integrated_worker, "transform_output", MagicMock( - return_value=TransformOutputResult(b"result", [], "c", "float32") + return_value=[TransformOutputResult(b"result", [], "c", "float32")] ), ) @@ -289,6 +411,56 @@ def test_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) +@pytest.mark.parametrize( + "setup_request_dispatcher", + [ + pytest.param("setup_request_dispatcher_model_bytes"), + pytest.param("setup_request_dispatcher_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_inputs", + "Error fetching input.", + id="fetch input", + ), + pytest.param( + "transform_input", + "Error Transforming input.", + id="transform input", + ), + ], +) +def test_dispatcher_pipeline_stage_errors_handled( + request, + setup_request_dispatcher, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures that the request dispatcher does not crash after a failure in various pipeline stages""" + request_dispatcher, integrated_worker_type = request.getfixturevalue( + setup_request_dispatcher + ) + integrated_worker = request_dispatcher._worker + + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_inputs"]: + monkeypatch.setattr( + integrated_worker, + "fetch_inputs", + MagicMock(return_value=[FetchInputResult(result=[b"result"], meta=None)]), + ) + + request_dispatcher._on_iteration() + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", error_message) + + def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" @@ -296,7 +468,7 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + 
"smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py new file mode 100644 index 0000000000..c8d97dd7ed --- /dev/null +++ b/tests/dragon/test_request_dispatcher.py @@ -0,0 +1,331 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import gc +import io +import logging +import pathlib +import socket +import time +import typing as t +from queue import Empty + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestBatch, + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group 
+pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. + + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + request_dispatcher_queue: DragonFLIChannel, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + for iteration_number in range(2): + + channel_key = Channel.make_process_local().serialize() + callback_channel = DragonCommChannel(channel_key) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + tensor = ( + (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) + ).numpy() + fsd = feature_store.descriptor + + tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(tensor.shape) + ) + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=base64.b64encode(callback_channel.descriptor).decode("utf-8"), + model=message_model_key, + inputs=[tensor_desc], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + with request_dispatcher_queue._fli.sendh( + timeout=None, stream_channel=request_dispatcher_queue._channel + ) as sendh: + sendh.send_bytes(request_bytes) + sendh.send_bytes(tensor.tobytes()) + time.sleep(1) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + +def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher batching and queueing system + + This also includes setting a queue to disposable, checking that it is no + longer referenced 
by the dispatcher. + """ + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 2, 4 * 1024**2) + dragon_fs = DragonFeatureStore(ddict) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + mem_pool_size=2 * 1024**2, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + request_dispatcher._on_start() + + for _ in range(2): + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + dragon_fs, + fs_path, + comm_path, + ), + ) + + msg_pump.start() + + time.sleep(1) + + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc + + try: + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, + transform_result.dims, + transform_result.dtypes, + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert batch.model_id.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_id.key == model_key + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones( + (2,), dtype=torch.float32 + ) + assert torch.equal(tensor_in, tensor_out) + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + assert 
model_key not in request_dispatcher._queues + + # Try to remove the dispatcher and free the memory + del request_dispatcher + gc.collect() diff --git a/tests/mli/test_torch_worker.py b/tests/dragon/test_torch_worker.py similarity index 61% rename from tests/mli/test_torch_worker.py rename to tests/dragon/test_torch_worker.py index 1e8bba7e33..88e800240f 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -25,9 +25,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import io +import typing as t +import numpy as np import pytest import torch + +dragon = pytest.importorskip("dragon") +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryAlloc, MemoryPool from torch import nn from torch.nn import functional as F @@ -39,14 +45,15 @@ FetchModelResult, InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger(__name__) -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # simple MNIST in PyTorch @@ -60,7 +67,7 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - def forward(self, x): + def forward(self, x, y): x = self.conv1(x) x = F.relu(x) x = self.conv2(x) @@ -86,7 +93,7 @@ def get_batch() -> torch.Tensor: def create_torch_model(): n = Net() example_forward_input = get_batch() - module = torch.jit.trace(n, example_forward_input) + module = torch.jit.trace(n, [example_forward_input, example_forward_input]) model_buffer = io.BytesIO() torch.jit.save(module, model_buffer) return model_buffer.getvalue() @@ -113,18 +120,27 @@ def get_request() -> InferenceRequest: ) +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) + + sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) worker = TorchWorker() def test_load_model(mlutils) -> None: fetch_model_result = FetchModelResult(sample_request.raw_model) load_model_result = worker.load_model( - sample_request, fetch_model_result, mlutils.get_test_device().lower() + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() ) assert load_model_result.model( - get_batch().to(torch_device[mlutils.get_test_device().lower()]) + get_batch().to(torch_device[mlutils.get_test_device().lower()]), + get_batch().to(torch_device[mlutils.get_test_device().lower()]), ).shape == torch.Size((20, 10)) @@ -133,44 +149,73 @@ def test_transform_input(mlutils) -> None: sample_request.raw_inputs, sample_request.input_meta ) + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + transform_input_result = worker.transform_input( - sample_request, fetch_input_result, mlutils.get_test_device().lower() + sample_request_batch, [fetch_input_result], mem_pool ) - assert all( - transformed.shape == get_batch().shape - for transformed in transform_input_result.transformed - ) + batch = get_batch().numpy() + assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + for tensor_index in range(2): + assert torch.Size(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == 
str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal( + tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index]) + ) + + mem_pool.destroy() def test_execute(mlutils) -> None: load_model_result = LoadModelResult( Net().to(torch_device[mlutils.get_test_device().lower()]) ) - transform_result = TransformInputResult( - [ - get_batch().to(torch_device[mlutils.get_test_device().lower()]) - for _ in range(2) - ] + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool ) - execute_result = worker.execute(sample_request, load_model_result, transform_result) + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) assert all( result.shape == torch.Size((20, 10)) for result in execute_result.predictions ) + mem_pool.destroy() + def test_transform_output(mlutils): - execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + tensors = [torch.rand((20, 10)) for _ in range(2)] + execute_result = ExecuteResult(tensors, [slice(0, 20)]) - transformed_output = worker.transform_output( - sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] - ) + transformed_output = worker.transform_output(sample_request_batch, execute_result) - assert transformed_output.outputs == [ - item.numpy().tobytes() for item in execute_result.predictions - ] - assert transformed_output.shape == None - assert transformed_output.order == "c" - assert transformed_output.dtype == "float32" + assert transformed_output[0].outputs == [item.numpy().tobytes() for item in tensors] + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index c8332c260f..a334164257 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -26,7 +26,6 @@ import io import logging -import multiprocessing as mp import pathlib import time @@ -36,10 +35,18 @@ dragon = pytest.importorskip("dragon") import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + import os import dragon.channels as dch from dragon import fli +from dragon.mpbridge.queues import DragonQueue from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel @@ -174,14 +181,15 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: callback_factory=FileSystemCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) - integrated_worker = TorchWorker() + integrated_worker_type = TorchWorker worker_manager = WorkerManager( config_loader, - integrated_worker, + integrated_worker_type, as_service=True, cooldown=5, device="cpu", + dispatcher_queue=mp.Queue(maxsize=0), ) 
worker_queue = config_loader.get_queue() From 128598b521f4cb26f2d35b41752aa84e1348425e Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 29 Aug 2024 10:32:55 -0700 Subject: [PATCH 21/60] Refactor `exception_handler` to avoid unnecessary building and serialization of failure responses. (#687) In this PR I fix the `exception_handler` so that it only builds and serializes a failure response if a reply channel is not None. I also needed to tweak the tests a bit by mocking out the reply channels. [ committed by @AlyssaCote ] [ approved by @mellis13 @al-rigazzi ] --- doc/changelog.md | 1 + .../infrastructure/control/error_handling.py | 8 ++++---- tests/dragon/test_error_handling.py | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index ac09ecf604..4ce6cf586c 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Refactor `exception_handler` - Add RequestDispatcher and the possibility of batching inference requests - Enable hostname selection for dragon tasks - Remove pydantic dependency from MLI code diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py index e2c5bcd9e1..5a42a8bfa8 100644 --- a/smartsim/_core/mli/infrastructure/control/error_handling.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -61,10 +61,10 @@ def exception_handler( f"Exception type: {type(exc).__name__}\n" f"Exception message: {str(exc)}" ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) if reply_channel: + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) reply_channel.send(serialized_resp) else: - logger.warning("Unable to notify client of error without reply_channel") + logger.warning("Unable to notify client of error without a reply channel") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index b20424866a..0e737101fa 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -307,14 +307,22 @@ def mock_stage(*args, **kwargs): mock_reply_fn, ) + mock_reply_channel = MagicMock() + mock_reply_channel.send = MagicMock() + def mock_exception_handler(exc, reply_channel, failure_message): - return exception_handler(exc, None, failure_message) + return exception_handler(exc, mock_reply_channel, failure_message) monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.workermanager.exception_handler", mock_exception_handler, ) + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.requestdispatcher.exception_handler", + mock_exception_handler, + ) + return mock_reply_fn @@ -464,7 +472,9 @@ def test_dispatcher_pipeline_stage_errors_handled( def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" - reply = InferenceReply() + + mock_reply_channel = MagicMock() + mock_reply_channel.send = MagicMock() mock_reply_fn = MagicMock() monkeypatch.setattr( @@ -473,7 +483,9 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): ) test_exception = ValueError("Test ValueError") - exception_handler(test_exception, None, "Failure while fetching the model.") + exception_handler( + test_exception, mock_reply_channel, "Failure while fetching the model." 
+ ) mock_reply_fn.assert_called_once() mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.") From 8aa990ca502bcbb005fd43751878b1b8f55942cf Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:40:27 -0700 Subject: [PATCH 22/60] SmartSim environment variables updated using new naming convention (#666) Updates SmartSim environment variable names with the new naming convention. [ committed by @AlyssaCote ] [ approved by @ashao ] --- doc/changelog.md | 1 + smartsim/_core/_cli/build.py | 12 ++++++------ smartsim/_core/_install/builder.py | 8 ++++++-- smartsim/_core/config/config.py | 31 +++++++++++++++++------------- smartsim/_core/utils/helpers.py | 2 +- smartsim/database/orchestrator.py | 4 ++-- smartsim/experiment.py | 2 +- tests/on_wlm/test_dragon.py | 2 +- tests/test_config.py | 28 +++++++++++++-------------- 9 files changed, 50 insertions(+), 40 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 4ce6cf586c..bfe42c804c 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Update SmartSim environment variables using new naming convention - Refactor `exception_handler` - Add RequestDispatcher and the possibility of batching inference requests - Enable hostname selection for dragon tasks diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 951521f171..65a5504c6f 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -78,22 +78,22 @@ def check_py_tf_version(versions: Versioner) -> None: def check_backends_install() -> bool: """Checks if backends have already been installed. Logs details on how to proceed forward - if the RAI_PATH environment variable is set or if + if the SMARTSIM_RAI_LIB environment variable is set or if backends have already been installed. """ - rai_path = os.environ.get("RAI_PATH", "") + rai_path = os.environ.get("SMARTSIM_RAI_LIB", "") installed = installed_redisai_backends() msg = "" if rai_path and installed: msg = ( f"There is no need to build. backends are already built and " - f"specified in the environment at 'RAI_PATH': {CONFIG.redisai}" + f"specified in the environment at 'SMARTSIM_RAI_LIB': {CONFIG.redisai}" ) elif rai_path and not installed: msg = ( - "Before running 'smart build', unset your RAI_PATH environment " - "variable with 'unset RAI_PATH'." + "Before running 'smart build', unset your SMARTSIM_RAI_LIB environment " + "variable with 'unset SMARTSIM_RAI_LIB'." 
) elif not rai_path and installed: msg = ( @@ -368,7 +368,7 @@ def _configure_keydb_build(versions: Versioner) -> None: CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") if not CONFIG.conf_path.resolve().is_file(): raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" + "Database configuration file at SMARTSIM_REDIS_CONF could not be found" ) diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 8f5bdc5570..e41fe2342d 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -342,7 +342,9 @@ def build_from_git( bin_path = Path(dependency_path, "bin").resolve() try: database_exe = next(bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + database = Path( + os.environ.get("SMARTSIM_REDIS_SERVER_EXE", database_exe) + ).resolve() _ = expand_exe_path(str(database)) except (TypeError, FileNotFoundError) as e: raise BuildError("Installation of redis-server failed!") from e @@ -350,7 +352,9 @@ def build_from_git( # validate install -- redis-cli try: redis_cli_exe = next(bin_path.glob("*-cli")) - redis_cli = Path(os.environ.get("REDIS_CLI_PATH", redis_cli_exe)).resolve() + redis_cli = Path( + os.environ.get("SMARTSIM_REDIS_CLI_EXE", redis_cli_exe) + ).resolve() _ = expand_exe_path(str(redis_cli)) except (TypeError, FileNotFoundError) as e: raise BuildError("Installation of redis-cli failed!") from e diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 9cf950b215..98e895a7d0 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -40,19 +40,19 @@ # These values can be set through environment variables to # override the default behavior of SmartSim. # -# RAI_PATH +# SMARTSIM_RAI_LIB # - Path to the RAI shared library # - Default: /smartsim/smartsim/_core/lib/redisai.so # -# REDIS_CONF +# SMARTSIM_REDIS_CONF # - Path to the redis.conf file # - Default: /SmartSim/smartsim/_core/config/redis.conf # -# REDIS_PATH +# SMARTSIM_REDIS_SERVER_EXE # - Path to the redis-server executable # - Default: /SmartSim/smartsim/_core/bin/redis-server # -# REDIS_CLI_PATH +# SMARTSIM_REDIS_CLI_EXE # - Path to the redis-cli executable # - Default: /SmartSim/smartsim/_core/bin/redis-cli # @@ -105,20 +105,20 @@ def __init__(self) -> None: @property def redisai(self) -> str: rai_path = self.lib_path / "redisai.so" - redisai = Path(os.environ.get("RAI_PATH", rai_path)).resolve() + redisai = Path(os.environ.get("SMARTSIM_RAI_LIB", rai_path)).resolve() if not redisai.is_file(): raise SSConfigError( "RedisAI dependency not found. 
Build with `smart` cli " - "or specify RAI_PATH" + "or specify SMARTSIM_RAI_LIB" ) return str(redisai) @property def database_conf(self) -> str: - conf = Path(os.environ.get("REDIS_CONF", self.conf_path)).resolve() + conf = Path(os.environ.get("SMARTSIM_REDIS_CONF", self.conf_path)).resolve() if not conf.is_file(): raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" + "Database configuration file at SMARTSIM_REDIS_CONF could not be found" ) return str(conf) @@ -126,24 +126,29 @@ def database_conf(self) -> str: def database_exe(self) -> str: try: database_exe = next(self.bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + database = Path( + os.environ.get("SMARTSIM_REDIS_SERVER_EXE", database_exe) + ).resolve() exe = expand_exe_path(str(database)) return exe except (TypeError, FileNotFoundError) as e: raise SSConfigError( - "Specified database binary at REDIS_PATH could not be used" + "Specified database binary at SMARTSIM_REDIS_SERVER_EXE " + "could not be used" ) from e @property def database_cli(self) -> str: try: redis_cli_exe = next(self.bin_path.glob("*-cli")) - redis_cli = Path(os.environ.get("REDIS_CLI_PATH", redis_cli_exe)).resolve() + redis_cli = Path( + os.environ.get("SMARTSIM_REDIS_CLI_EXE", redis_cli_exe) + ).resolve() exe = expand_exe_path(str(redis_cli)) return exe except (TypeError, FileNotFoundError) as e: raise SSConfigError( - "Specified Redis binary at REDIS_CLI_PATH could not be used" + "Specified Redis binary at SMARTSIM_REDIS_CLI_EXE could not be used" ) from e @property @@ -163,7 +168,7 @@ def dragon_dotenv(self) -> Path: def dragon_server_path(self) -> t.Optional[str]: return os.getenv( "SMARTSIM_DRAGON_SERVER_PATH", - os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), + os.getenv("_SMARTSIM_DRAGON_SERVER_PATH_EXP", None), ) @property diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index df2c016a17..f82215f03a 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -221,7 +221,7 @@ def _installed(base_path: Path, backend: str) -> bool: """ backend_key = f"redisai_{backend}" backend_path = base_path / backend_key / f"{backend_key}.so" - backend_so = Path(os.environ.get("RAI_PATH", backend_path)).resolve() + backend_so = Path(os.environ.get("SMARTSIM_RAI_LIB", backend_path)).resolve() return backend_so.is_file() diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index e2549891af..e5e99c8932 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -265,8 +265,8 @@ def __init__( raise SSConfigError( "SmartSim not installed with pre-built extensions (Redis)\n" "Use the `smart` cli tool to install needed extensions\n" - "or set REDIS_PATH and REDIS_CLI_PATH in your environment\n" - "See documentation for more information" + "or set SMARTSIM_REDIS_SERVER_EXE and SMARTSIM_REDIS_CLI_EXE " + "in your environment\nSee documentation for more information" ) from e if self.launcher != "local": diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 607a90ae16..9a14eecdc8 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -178,7 +178,7 @@ def __init__( def _set_dragon_server_path(self) -> None: """Set path for dragon server through environment varialbes""" if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: - environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join( + environ["_SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join( self.exp_path, 
CONFIG.dragon_default_subdir ) diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py index a05d381415..1bef3cac8d 100644 --- a/tests/on_wlm/test_dragon.py +++ b/tests/on_wlm/test_dragon.py @@ -56,7 +56,7 @@ def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypa def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH", raising=False) - monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False) + monkeypatch.delenv("_SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False) exp: Experiment = Experiment( "test_dragon_connection", exp_path=test_dir, diff --git a/tests/test_config.py b/tests/test_config.py index 00a1fcdd36..5a84103ffd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -66,9 +66,9 @@ def get_redisai_env( """ env = os.environ.copy() if rai_path is not None: - env["RAI_PATH"] = rai_path + env["SMARTSIM_RAI_LIB"] = rai_path else: - env.pop("RAI_PATH", None) + env.pop("SMARTSIM_RAI_LIB", None) if lib_path is not None: env["SMARTSIM_DEP_INSTALL_PATH"] = lib_path @@ -85,7 +85,7 @@ def make_file(filepath: str) -> None: def test_redisai_invalid_rai_path(test_dir, monkeypatch): - """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail""" + """An invalid SMARTSIM_RAI_LIB and valid SMARTSIM_DEP_INSTALL_PATH should fail""" rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(os.path.join(test_dir, "lib", "redisai.so")) @@ -94,7 +94,7 @@ def test_redisai_invalid_rai_path(test_dir, monkeypatch): config = Config() - # Fail when no file exists @ RAI_PATH + # Fail when no file exists @ SMARTSIM_RAI_LIB with pytest.raises(SSConfigError) as ex: _ = config.redisai @@ -102,7 +102,7 @@ def test_redisai_invalid_rai_path(test_dir, monkeypatch): def test_redisai_valid_rai_path(test_dir, monkeypatch): - """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed""" + """A valid SMARTSIM_RAI_LIB should override valid SMARTSIM_DEP_INSTALL_PATH and succeed""" rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(rai_file_path) @@ -117,7 +117,7 @@ def test_redisai_valid_rai_path(test_dir, monkeypatch): def test_redisai_invalid_lib_path(test_dir, monkeypatch): - """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail""" + """Invalid SMARTSIM_RAI_LIB and invalid SMARTSIM_DEP_INSTALL_PATH should fail""" rai_file_path = f"{test_dir}/railib/redisai.so" @@ -133,7 +133,7 @@ def test_redisai_invalid_lib_path(test_dir, monkeypatch): def test_redisai_valid_lib_path(test_dir, monkeypatch): - """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed""" + """Valid SMARTSIM_RAI_LIB and invalid SMARTSIM_DEP_INSTALL_PATH should succeed""" rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(rai_file_path) @@ -147,7 +147,7 @@ def test_redisai_valid_lib_path(test_dir, monkeypatch): def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch): - """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" + """Missing SMARTSIM_RAI_LIB and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" rai_file_path: t.Optional[str] = None lib_file_path = os.path.join(test_dir, "lib", "redisai.so") @@ -166,11 +166,11 @@ def test_redis_conf(): assert Path(config.database_conf).is_file() assert isinstance(config.database_conf, str) - os.environ["REDIS_CONF"] = "not/a/path" + os.environ["SMARTSIM_REDIS_CONF"] = "not/a/path" config = Config() with 
pytest.raises(SSConfigError): config.database_conf - os.environ.pop("REDIS_CONF") + os.environ.pop("SMARTSIM_REDIS_CONF") def test_redis_exe(): @@ -178,11 +178,11 @@ def test_redis_exe(): assert Path(config.database_exe).is_file() assert isinstance(config.database_exe, str) - os.environ["REDIS_PATH"] = "not/a/path" + os.environ["SMARTSIM_REDIS_SERVER_EXE"] = "not/a/path" config = Config() with pytest.raises(SSConfigError): config.database_exe - os.environ.pop("REDIS_PATH") + os.environ.pop("SMARTSIM_REDIS_SERVER_EXE") def test_redis_cli(): @@ -190,11 +190,11 @@ def test_redis_cli(): assert Path(config.redisai).is_file() assert isinstance(config.redisai, str) - os.environ["REDIS_CLI_PATH"] = "not/a/path" + os.environ["SMARTSIM_REDIS_CLI_EXE"] = "not/a/path" config = Config() with pytest.raises(SSConfigError): config.database_cli - os.environ.pop("REDIS_CLI_PATH") + os.environ.pop("SMARTSIM_REDIS_CLI_EXE") @pytest.mark.parametrize( From f6d55d8d7bd316331e939aacfe121ab87bdd04e5 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:41:20 -0700 Subject: [PATCH 23/60] MLI file names conform to snake case (#689) Update MLI filenames to be snake case. --- doc/changelog.md | 1 + ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 2 +- ...manager.py => standalone_worker_manager.py} | 14 +++++++------- .../{dragonchannel.py => dragon_channel.py} | 0 .../channel/{dragonfli.py => dragon_fli.py} | 0 .../{devicemanager.py => device_manager.py} | 2 +- ...uestdispatcher.py => request_dispatcher.py} | 6 +++--- .../{workermanager.py => worker_manager.py} | 10 +++++----- ...ironmentloader.py => environment_loader.py} | 2 +- ...featurestore.py => dragon_feature_store.py} | 2 +- .../{featurestore.py => feature_store.py} | 0 .../_core/mli/infrastructure/worker/worker.py | 2 +- .../{featurestore.py => feature_store.py} | 2 +- .../test_core_machine_learning_worker.py | 4 ++-- tests/dragon/test_device_manager.py | 4 ++-- tests/dragon/test_environment_loader.py | 8 ++++---- tests/dragon/test_error_handling.py | 18 +++++++++--------- tests/dragon/test_reply_building.py | 2 +- tests/dragon/test_request_dispatcher.py | 14 +++++++------- tests/dragon/test_torch_worker.py | 2 +- tests/dragon/test_worker_manager.py | 10 +++++----- .../mli/{featurestore.py => feature_store.py} | 2 +- 23 files changed, 55 insertions(+), 54 deletions(-) rename ex/high_throughput_inference/{standalone_workermanager.py => standalone_worker_manager.py} (92%) rename smartsim/_core/mli/comm/channel/{dragonchannel.py => dragon_channel.py} (100%) rename smartsim/_core/mli/comm/channel/{dragonfli.py => dragon_fli.py} (100%) rename smartsim/_core/mli/infrastructure/control/{devicemanager.py => device_manager.py} (98%) rename smartsim/_core/mli/infrastructure/control/{requestdispatcher.py => request_dispatcher.py} (99%) rename smartsim/_core/mli/infrastructure/control/{workermanager.py => worker_manager.py} (98%) rename smartsim/_core/mli/infrastructure/{environmentloader.py => environment_loader.py} (98%) rename smartsim/_core/mli/infrastructure/storage/{dragonfeaturestore.py => dragon_feature_store.py} (98%) rename smartsim/_core/mli/infrastructure/storage/{featurestore.py => feature_store.py} (100%) rename tests/dragon/{featurestore.py => feature_store.py} (98%) rename tests/mli/{featurestore.py => feature_store.py} (98%) diff --git a/doc/changelog.md b/doc/changelog.md index bfe42c804c..004be997a0 100644 --- a/doc/changelog.md +++ 
b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Filenames conform to snake case - Update SmartSim environment variables using new naming convention - Refactor `exception_handler` - Add RequestDispatcher and the possibility of batching inference requests diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 807a70b219..36f427937c 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -13,7 +13,7 @@ NUM_RANKS = 4 NUM_WORKERS = 1 filedir = os.path.dirname(__file__) -worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +worker_manager_script_name = os.path.join(filedir, "standalone_worker_manager.py") app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 517d18fb2f..dcc52296ef 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -43,7 +43,7 @@ import torch from mpi4py import MPI -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) from smartsim._core.mli.message_handler import MessageHandler diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_worker_manager.py similarity index 92% rename from ex/high_throughput_inference/standalone_workermanager.py rename to ex/high_throughput_inference/standalone_worker_manager.py index 0b8c61251b..feb1af1aee 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -58,17 +58,17 @@ from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.control.requestdispatcher import ( +from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) -from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager -from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.infrastructure.control.worker_manager import WorkerManager +from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py similarity index 100% rename from smartsim/_core/mli/comm/channel/dragonchannel.py rename to smartsim/_core/mli/comm/channel/dragon_channel.py diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py 
b/smartsim/_core/mli/comm/channel/dragon_fli.py similarity index 100% rename from smartsim/_core/mli/comm/channel/dragonfli.py rename to smartsim/_core/mli/comm/channel/dragon_fli.py diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py similarity index 98% rename from smartsim/_core/mli/infrastructure/control/devicemanager.py rename to smartsim/_core/mli/infrastructure/control/device_manager.py index 3570bd51ed..54d58507ee 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/device_manager.py @@ -28,7 +28,7 @@ from contextlib import _GeneratorContextManager, contextmanager from .....log import get_logger -from ...infrastructure.storage.featurestore import FeatureStore +from ..storage.feature_store import FeatureStore from ..worker.worker import MachineLearningWorkerBase, RequestBatch logger = get_logger(__name__) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py similarity index 99% rename from smartsim/_core/mli/infrastructure/control/requestdispatcher.py rename to smartsim/_core/mli/infrastructure/control/request_dispatcher.py index d56912a8f0..513dc5f639 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -47,9 +47,9 @@ from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer -from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore -from ...infrastructure.worker.worker import ( +from ..environment_loader import EnvironmentConfigLoader +from ..storage.feature_store import FeatureStore +from ..worker.worker import ( InferenceRequest, MachineLearningWorkerBase, ModelIdentifier, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py similarity index 98% rename from smartsim/_core/mli/infrastructure/control/workermanager.py rename to smartsim/_core/mli/infrastructure/control/worker_manager.py index 54a245b813..d831a879aa 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -38,20 +38,20 @@ import typing as t from queue import Empty -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from .....log import get_logger from ....entrypoints.service import Service from ....utils.timings import PerfTimer -from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.worker.worker import ( +from ...message_handler import MessageHandler +from ..environment_loader import EnvironmentConfigLoader +from ..worker.worker import ( InferenceReply, LoadModelResult, MachineLearningWorkerBase, RequestBatch, ) -from ...message_handler import MessageHandler -from .devicemanager import DeviceManager, WorkerDevice +from .device_manager import DeviceManager, WorkerDevice from .error_handling import build_failure_reply, exception_handler if t.TYPE_CHECKING: diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environment_loader.py similarity index 98% rename from 
smartsim/_core/mli/infrastructure/environmentloader.py rename to smartsim/_core/mli/infrastructure/environment_loader.py index 99202ef2ea..c8b158a5ad 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -28,7 +28,7 @@ import typing as t from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.log import get_logger logger = get_logger(__name__) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py similarity index 98% rename from smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py rename to smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index e89abcd2a2..aee4aac529 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -32,7 +32,7 @@ # isort: on -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.error import SmartSimError from smartsim.log import get_logger diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py similarity index 100% rename from smartsim/_core/mli/infrastructure/storage/featurestore.py rename to smartsim/_core/mli/infrastructure/storage/feature_store.py diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 25e4dc49f7..41de23b561 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -37,9 +37,9 @@ from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase -from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model +from ..storage.feature_store import FeatureStore, FeatureStoreKey if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status diff --git a/tests/dragon/featurestore.py b/tests/dragon/feature_store.py similarity index 98% rename from tests/dragon/featurestore.py rename to tests/dragon/feature_store.py index d06035fd70..178b675e64 100644 --- a/tests/dragon/featurestore.py +++ b/tests/dragon/feature_store.py @@ -28,7 +28,7 @@ import typing as t import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.log import get_logger logger = get_logger(__name__) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 231a971241..ed9ac625cd 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -34,7 +34,7 @@ import torch import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( 
InferenceRequest, MachineLearningWorkerCore, @@ -44,7 +44,7 @@ ) from smartsim._core.utils import installed_redisai_backends -from .featurestore import FileSystemFeatureStore, MemoryFeatureStore +from .feature_store import FileSystemFeatureStore, MemoryFeatureStore # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index 8edeb60fbb..c58879cb62 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -30,11 +30,11 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.control.devicemanager import ( +from smartsim._core.mli.infrastructure.control.device_manager import ( DeviceManager, WorkerDevice, ) -from smartsim._core.mli.infrastructure.storage.featurestore import ( +from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, FeatureStoreKey, ) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 8f2716488b..c3331336e5 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -33,10 +33,10 @@ from dragon.data.ddict.ddict import DDict from dragon.fli import DragonFLIError, FLInterface -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 0e737101fa..7f823a1c43 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -38,20 +38,20 @@ from dragon.fli import FLInterface from dragon.mpbridge.queues import DragonQueue -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.devicemanager import WorkerDevice -from smartsim._core.mli.infrastructure.control.requestdispatcher import ( +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice +from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) -from smartsim._core.mli.infrastructure.control.workermanager import ( +from smartsim._core.mli.infrastructure.control.worker_manager import ( WorkerManager, exception_handler, ) -from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import ( +from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, FeatureStoreKey, ) @@ -314,12 +314,12 @@ def mock_exception_handler(exc, reply_channel, failure_message): return 
exception_handler(exc, mock_reply_channel, failure_message) monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.exception_handler", + "smartsim._core.mli.infrastructure.control.worker_manager.exception_handler", mock_exception_handler, ) monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.requestdispatcher.exception_handler", + "smartsim._core.mli.infrastructure.control.request_dispatcher.exception_handler", mock_exception_handler, ) diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index 5f179bbae0..7a8e637803 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -30,7 +30,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.control.workermanager import build_failure_reply +from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index c8d97dd7ed..eeb8cd238b 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -61,24 +61,24 @@ from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.requestdispatcher import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, RequestDispatcher, ) -from smartsim._core.mli.infrastructure.control.workermanager import ( +from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, ) -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .featurestore import FileSystemFeatureStore +from .feature_store import FileSystemFeatureStore from .utils.channel import FileSystemCommChannel logger = get_logger(__name__) diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 88e800240f..9a5ed6309f 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -37,7 +37,7 @@ from torch import nn from torch.nn import functional as F -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index a334164257..1ebc512a50 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -49,20 
+49,20 @@ from dragon.mpbridge.queues import DragonQueue from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.workermanager import ( +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, WorkerManager, ) -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .featurestore import FileSystemFeatureStore +from .feature_store import FileSystemFeatureStore from .utils.channel import FileSystemCommChannel logger = get_logger(__name__) diff --git a/tests/mli/featurestore.py b/tests/mli/feature_store.py similarity index 98% rename from tests/mli/featurestore.py rename to tests/mli/feature_store.py index de748ae6e5..7ecc01814c 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/feature_store.py @@ -28,7 +28,7 @@ import typing as t import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.log import get_logger logger = get_logger(__name__) From a3795e7a828ad724e01e6637fafc0d57bf05ad9a Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 10 Sep 2024 12:55:37 -0400 Subject: [PATCH 24/60] Add event broadcasting capability (#672) Event broadcasting will enable the system to notify other MLI resources of changes. This PR contains the base capabilities required for publishing & consuming channel messages as events. 
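A rough usage sketch of how the new pieces are intended to fit together
(illustrative only, not shipped in this patch; it assumes a Dragon-backed
allocation and, like the new unit tests, substitutes a plain dict for the
backbone DDict):

    from smartsim._core.mli.comm.channel.dragon_channel import (
        DragonCommChannel,
        create_local,
    )
    from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
        BackboneFeatureStore,
        EventBroadcaster,
        EventCategory,
        EventConsumer,
        OnWriteFeatureStore,
    )

    # the backbone stores the list of registered notification channels
    backbone = BackboneFeatureStore({}, allow_reserved_writes=True)

    # a consumer exposes a comm channel and registers its descriptor
    consumer_channel = DragonCommChannel(create_local())
    backbone.notification_channels = [consumer_channel.descriptor_string]
    consumer = EventConsumer(
        consumer_channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN]
    )

    # any producer can then fan an event out to every registered consumer
    broadcaster = EventBroadcaster(
        backbone, channel_factory=DragonCommChannel.from_descriptor
    )
    broadcaster.send(OnWriteFeatureStore(backbone.descriptor, "some-key"))

    # the consumer drains its channel, keeping only events that pass its filters
    events = consumer.receive()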
[ committed by @ankona ] [ reviewed by @mellis13 @al-rigazzi @AlyssaCote ] --- doc/changelog.md | 1 + smartsim/_core/mli/comm/channel/channel.py | 11 +- .../_core/mli/comm/channel/dragon_channel.py | 131 +++- smartsim/_core/mli/comm/channel/dragon_fli.py | 34 +- .../control/request_dispatcher.py | 11 +- .../storage/backbone_feature_store.py | 405 ++++++++++ .../storage/dragon_feature_store.py | 56 +- .../infrastructure/storage/feature_store.py | 107 ++- smartsim/_core/utils/timings.py | 42 +- tests/dragon/feature_store.py | 88 +-- tests/dragon/test_featurestore_base.py | 722 ++++++++++++++++++ tests/dragon/test_featurestore_integration.py | 267 +++++++ tests/dragon/test_request_dispatcher.py | 6 +- tests/dragon/utils/channel.py | 63 +- tests/mli/channel.py | 68 +- tests/mli/feature_store.py | 43 +- tests/test_message_handler/test_request.py | 8 +- 17 files changed, 1875 insertions(+), 188 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py create mode 100644 tests/dragon/test_featurestore_base.py create mode 100644 tests/dragon/test_featurestore_integration.py diff --git a/doc/changelog.md b/doc/changelog.md index 004be997a0..b6f134d2a5 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention - Refactor `exception_handler` diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index d918591264..09d3ac62b7 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import base64 import typing as t from abc import ABC, abstractmethod @@ -40,20 +41,22 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: self._descriptor = descriptor @abstractmethod - def send(self, value: bytes) -> None: + def send(self, value: bytes, timeout: float = 0) -> None: """Send a message through the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to send :param value: The value to send""" @abstractmethod - def recv(self) -> t.List[bytes]: - """Receieve a message through the underlying communication channel + def recv(self, timeout: float = 0) -> t.List[bytes]: + """Receives message(s) through the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message""" @property def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" if isinstance(self._descriptor, str): - return self._descriptor.encode("utf-8") + return base64.b64decode(self._descriptor.encode("utf-8")) return self._descriptor diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 89b90f2e62..e902ddadde 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -28,47 +28,142 @@ import sys import typing as t +import dragon.channels as dch +import dragon.infrastructure.facts as df +import dragon.infrastructure.parameters as dp +import dragon.managed_memory as dm +import dragon.utils as du + import smartsim._core.mli.comm.channel.channel as cch +from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) import dragon.channels as dch +DEFAULT_CHANNEL_BUFFER_SIZE = 500 +"""Maximum number of messages that can be buffered. DragonCommChannel will +raise an exception if no clients consume messages before the buffer is filled.""" + + +def create_local(capacity: int = 0) -> dch.Channel: + """Creates a Channel attached to the local memory pool + + :param capacity: the number of events the channel can buffer; uses the default + buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied + :returns: the instantiated channel""" + pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) + channel: t.Optional[dch.Channel] = None + offset = 0 + + capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE + + while not channel: + # search for an open channel ID + offset += 1 + cid = df.BASE_USER_MANAGED_CUID + offset + try: + channel = dch.Channel( + mem_pool=pool, + c_uid=cid, + capacity=capacity, + ) + logger.debug( + f"Channel {cid} created in pool {pool.serialize()} w/capacity {capacity}" + ) + except Exception: + if offset < 100: + logger.warning(f"Unable to attach to channnel id {cid}. 
Retrying...") + else: + logger.error(f"All attempts to attach local channel have failed") + raise + + return channel + class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" - def __init__(self, key: bytes) -> None: - """Initialize the DragonCommChannel instance""" - super().__init__(key) - self._channel: dch.Channel = dch.Channel.attach(key) + def __init__(self, channel: "dch.Channel") -> None: + """Initialize the DragonCommChannel instance - def send(self, value: bytes) -> None: + :param channel: a channel to use for communications + :param recv_timeout: a default timeout to apply to receive calls""" + serialized_ch = channel.serialize() + descriptor = base64.b64encode(serialized_ch).decode("utf-8") + super().__init__(descriptor) + self._channel = channel + + @property + def channel(self) -> "dch.Channel": + """The underlying communication channel""" + return self._channel + + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message throuh the underlying communication channel - :param value: The value to send""" - with self._channel.sendh(timeout=None) as sendh: + + :param value: The value to send + :param timeout: maximum time to wait (in seconds) for messages to send""" + with self._channel.sendh(timeout=timeout) as sendh: sendh.send_bytes(value) + logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") - def recv(self) -> t.List[bytes]: - """Receieve a message through the underlying communication channel + def recv(self, timeout: float = 0.001) -> t.List[bytes]: + """Receives message(s) through the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: - message_bytes: bytes = recvh.recv_bytes(timeout=None) - return [message_bytes] + with self._channel.recvh(timeout=timeout) as recvh: + messages: t.List[bytes] = [] + + try: + message_bytes = recvh.recv_bytes(timeout=timeout) + messages.append(message_bytes) + logger.debug(f"DragonCommChannel {self.descriptor!r} received message") + except dch.ChannelEmpty: + # emptied the queue, ok to swallow this ex + logger.debug(f"DragonCommChannel exhausted: {self.descriptor!r}") + except dch.ChannelRecvTimeout as ex: + logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor!r}") + + return messages + + @property + def descriptor_string(self) -> str: + """Return the channel descriptor for the underlying dragon channel + as a string. Automatically performs base64 encoding to ensure the + string can be used in a call to `from_descriptor`""" + if isinstance(self._descriptor, str): + return self._descriptor + + if isinstance(self._descriptor, bytes): + return base64.b64encode(self._descriptor).decode("utf-8") + + raise ValueError(f"Unable to convert channel descriptor: {self._descriptor}") @classmethod def from_descriptor( cls, - descriptor: str, + descriptor: t.Union[bytes, str], ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string - :param descriptor: The descriptor that uniquely identifies the resource + :param descriptor: The descriptor that uniquely identifies the resource. Output + from `descriptor_string` is correctly encoded. 
:returns: An attached DragonCommChannel""" try: - return DragonCommChannel(base64.b64decode(descriptor)) - except: - logger.error(f"Failed to create dragon comm channel: {descriptor}") - raise + utf8_descriptor: t.Union[str, bytes] = descriptor + if isinstance(descriptor, str): + utf8_descriptor = descriptor.encode("utf-8") + + # todo: ensure the bytes argument and condition are removed + # after refactoring the RPC models + + actual_descriptor = base64.b64decode(utf8_descriptor) + channel = dch.Channel.attach(actual_descriptor) + return DragonCommChannel(channel) + except Exception as ex: + raise SmartSimError( + f"Failed to create dragon comm channel: {descriptor!r}" + ) from ex diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 130c5cf5eb..a5e5f9f350 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -27,6 +27,10 @@ # isort: off from dragon import fli import dragon.channels as dch +import dragon.infrastructure.facts as df +import dragon.infrastructure.parameters as dp +import dragon.managed_memory as dm +import dragon.utils as du # isort: on @@ -34,6 +38,7 @@ import typing as t import smartsim._core.mli.comm.channel.channel as cch +from smartsim._core.mli.comm.channel.dragon_channel import create_local from smartsim.log import get_logger logger = get_logger(__name__) @@ -42,37 +47,48 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: + def __init__( + self, + fli_desc: bytes, + sender_supplied: bool = True, + buffer_size: int = 0, + ) -> None: """Initialize the DragonFLIChannel instance :param fli_desc: the descriptor of the FLI channel to attach :param sender_supplied: flag indicating if the FLI uses sender-supplied streams + :param buffer_size: maximum number of sent messages that can be buffered """ super().__init__(fli_desc) - # todo: do we need memory pool information to construct the channel correctly? 
self._fli: "fli" = fli.FLInterface.attach(fli_desc) self._channel: t.Optional["dch"] = ( - dch.Channel.make_process_local() if sender_supplied else None + create_local(buffer_size) if sender_supplied else None ) - def send(self, value: bytes) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to send :param value: The value to send""" with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: - sendh.send_bytes(value) + sendh.send_bytes(value, timeout=timeout) + logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") - def recv(self) -> t.List[bytes]: - """Receieve a message through the underlying communication channel + def recv(self, timeout: float = 0.001) -> t.List[bytes]: + """Receives message(s) through the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=0.001) as recvh: + with self._fli.recvh(timeout=timeout) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=None) + message, _ = recvh.recv_bytes(timeout=timeout) messages.append(message) + logger.debug( + f"DragonFLIChannel {self.descriptor!r} received message" + ) except fli.FLIEOT: eot = True return messages diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 513dc5f639..21fd98893d 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -316,10 +316,10 @@ def _on_iteration(self) -> None: conditions are satisfied and cooldown is elapsed. """ try: - self._perf_timer.set_active(True) + self._perf_timer.is_active = True bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: - self._perf_timer.set_active(False) + self._perf_timer.is_active = False else: if not bytes_list: exception_handler( @@ -501,4 +501,9 @@ def _can_shutdown(self) -> bool: return False def __del__(self) -> None: - self._mem_pool.destroy() + """Destroy allocated memory resources""" + # pool may be null if a failure occurs prior to successful attach + pool: t.Optional[MemoryPool] = getattr(self, "_mem_pool", None) + + if pool: + pool.destroy() diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py new file mode 100644 index 0000000000..e3ea9f918b --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -0,0 +1,405 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import enum +import pickle +import time +import typing as t +import uuid +from collections import defaultdict, deque +from dataclasses import dataclass + +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +# todo: did i create an arms race where a developer just grabs the backbone +# and passes it wherever they need a FeatureStore? +class BackboneFeatureStore(DragonFeatureStore): + """A DragonFeatureStore wrapper with utility methods for accessing shared + information stored in the MLI backbone feature store""" + + MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" + + def __init__( + self, storage: "dragon_ddict.DDict", allow_reserved_writes: bool = False + ) -> None: + """Initialize the DragonFeatureStore instance + + :param storage: A distributed dictionary to be used as the underlying + storage mechanism of the feature store""" + super().__init__(storage) + self._enable_reserved_writes = allow_reserved_writes + + @property + def notification_channels(self) -> t.Sequence[str]: + """Retrieve descriptors for all registered MLI notification channels + + :returns: the list of descriptors""" + if "_SMARTSIM_MLI_NOTIFY_CONSUMERS" in self: + stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] + return str(stored_consumers).split(",") + return [] + + @notification_channels.setter + def notification_channels(self, values: t.Sequence[str]) -> None: + """Set the notification channels to be sent events + + :param values: the list of channel descriptors to save""" + self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values]) + + +class EventCategory(str, enum.Enum): + """Predefined event types raised by SmartSim backend""" + + CONSUMER_CREATED: str = "consumer-created" + FEATURE_STORE_WRITTEN: str = "feature-store-written" + + +@dataclass +class EventBase: + """Core API for an event""" + + # todo: shift eventing code to: infrastructure / event / event.py + category: EventCategory + """The event category for this event; may be used for addressing, + prioritization, or filtering of events by a event publisher/consumer""" + + uid: str + """A unique identifier for this event""" + + def __bytes__(self) -> bytes: + """Default conversion to bytes for an event required to publish + messages using byte-oriented communication channels + + :returns: this entity encoded as bytes""" + return pickle.dumps(self) + + def __str__(self) -> str: + """Convert the event to a 
string + + :returns: a string representation of this instance""" + return f"{self.uid}|{self.category}" + + +class OnCreateConsumer(EventBase): + """Publish this event when a new event consumer registration is required""" + + descriptor: str + """Descriptor of the comm channel exposed by the consumer""" + + def __init__(self, descriptor: str) -> None: + """Initialize the event + + :param descriptor: descriptor of the comm channel exposed by the consumer + """ + super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4())) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string + + :returns: a string representation of this instance""" + return f"{str(super())}|{self.descriptor}" + + +class OnWriteFeatureStore(EventBase): + """Publish this event when a feature store key is written""" + + descriptor: str + """The descriptor of the feature store where the write occurred""" + + key: str + """The key identifying where the write occurred""" + + def __init__(self, descriptor: str, key: str) -> None: + """Initialize the event + + :param descriptor: The descriptor of the feature store where the write occurred + :param key: The key identifying where the write occurred + """ + super().__init__(EventCategory.FEATURE_STORE_WRITTEN, str(uuid.uuid4())) + self.descriptor = descriptor + self.key = key + + def __str__(self) -> str: + """Convert the event to a string + + :returns: a string representation of this instance""" + return f"{str(super())}|{self.descriptor}|{self.key}" + + +class EventProducer(t.Protocol): + """Core API of a class that publishes events""" + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """The send operation + + :param event: the event to send + :param timeout: maximum time to wait (in seconds) for messages to send""" + + +class EventBroadcaster: + """Performs fan-out publishing of system events""" + + def __init__( + self, + backbone: BackboneFeatureStore, + channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, + ) -> None: + """Initialize the EventPublisher instance + + :param backbone: the MLI backbone feature store + :param channel_factory: factory method to construct new channel instances + """ + self._backbone = backbone + """The backbone feature store used to retrieve consumer descriptors""" + self._channel_factory = channel_factory + """A factory method used to instantiate channels from descriptors""" + self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict( + lambda: None + ) + """A mapping of instantiated channels that can be re-used. Automatically + calls the channel factory if a descriptor is not already in the collection""" + self._event_buffer: t.Deque[bytes] = deque() + """A buffer for storing events when a consumer list is not found.""" + self._descriptors: t.Set[str] + """Stores the most recent list of broadcast consumers. Updated automatically + on each broadcast""" + self._uid = str(uuid.uuid4()) + """A unique identifer assigned to the broadcaster for logging""" + + @property + def num_buffered(self) -> int: + """Return the number of events currently buffered to send""" + return len(self._event_buffer) + + def _save_to_buffer(self, event: EventBase) -> None: + """Places a serialized event in the buffer to be sent once a consumer + list is available. 
+ + :param event: The event to serialize and buffer""" + + try: + event_bytes = bytes(event) + self._event_buffer.append(event_bytes) + except Exception as ex: + raise ValueError(f"Unable to serialize event from {self._uid}") from ex + + def _log_broadcast_start(self) -> None: + """Logs broadcast statistics""" + num_events = len(self._event_buffer) + num_copies = len(self._descriptors) + logger.debug( + f"Broadcast {num_events} events to {num_copies} consumers from {self._uid}" + ) + + def _prune_unused_consumers(self) -> None: + """Performs maintenance on the channel cache by pruning any channel + that has been removed from the consumers list""" + active_consumers = set(self._descriptors) + current_channels = set(self._channel_cache.keys()) + + # find any cached channels that are now unused + inactive_channels = current_channels.difference(active_consumers) + new_channels = active_consumers.difference(current_channels) + + for descriptor in inactive_channels: + self._channel_cache.pop(descriptor) + + logger.debug( + f"Pruning {len(inactive_channels)} stale consumers and" + f" found {len(new_channels)} new channels for {self._uid}" + ) + + def _get_comm_channel(self, descriptor: str) -> CommChannelBase: + """Helper method to build and cache a comm channel + + :param descriptor: the descriptor to pass to the channel factory + :returns: the instantiated channel + :raises SmartSimError: if the channel fails to build""" + comm_channel = self._channel_cache[descriptor] + if comm_channel is not None: + return comm_channel + + if self._channel_factory is None: + raise SmartSimError("No channel factory provided for consumers") + + try: + channel = self._channel_factory(descriptor) + self._channel_cache[descriptor] = channel + return channel + except Exception as ex: + msg = f"Unable to construct channel with descriptor: {descriptor}" + logger.error(msg, exc_info=True) + raise SmartSimError(msg) from ex + + def _broadcast(self, timeout: float = 0.001) -> int: + """Broadcasts all buffered events to registered event consumers. + + :param timeout: maximum time to wait (in seconds) for messages to send + :return: the number of events broadcasted to consumers + :raises ValueError: if event serialization fails + :raises KeyError: if channel fails to attach using registered descriptors + :raises SmartSimError: if broadcasting fails""" + + # allow descriptors to be empty since events are buffered + self._descriptors = set(x for x in self._backbone.notification_channels if x) + if not self._descriptors: + logger.warning(f"No event consumers are registered for {self._uid}") + return 0 + + self._prune_unused_consumers() + self._log_broadcast_start() + + num_sent: int = 0 + next_event: t.Optional[bytes] = self._event_buffer.popleft() + + # send each event to every consumer + while next_event is not None: + for descriptor in map(str, self._descriptors): + comm_channel = self._get_comm_channel(descriptor) + + try: + # todo: given a failure, the message is not sent to any other + # recipients. 
consider retrying, adding a dead letter queue, or + # logging the message details more intentionally + comm_channel.send(next_event, timeout) + num_sent += 1 + except Exception as ex: + raise SmartSimError( + f"Failed broadcast to channel {descriptor} from {self._uid}" + ) from ex + + try: + next_event = self._event_buffer.popleft() + except IndexError: + next_event = None + logger.debug(f"Broadcast buffer exhausted for {self._uid}") + + return num_sent + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """Implementation of `send` method of the `EventPublisher` protocol. Publishes + the supplied event to all registered broadcast consumers + + :param event: an event to publish + :param timeout: maximum time to wait (in seconds) for messages to send + :returns: the number of events successfully published + :raises ValueError: if event serialization fails + :raises KeyError: if channel fails to attach using registered descriptors + :raises SmartSimError: if any unexpected error occurs during send""" + try: + self._save_to_buffer(event) + return self._broadcast(timeout) + except (KeyError, ValueError, SmartSimError): + raise + except Exception as ex: + raise SmartSimError("An unexpected failure occurred while sending") from ex + + +class EventConsumer: + """Reads system events published to a communications channel""" + + def __init__( + self, + comm_channel: CommChannelBase, + backbone: BackboneFeatureStore, + filters: t.Optional[t.List[EventCategory]] = None, + batch_timeout: t.Optional[float] = None, + ) -> None: + """Initialize the EventConsumer instance + + :param comm_channel: communications channel to listen to for events + :param backbone: the MLI backbone feature store + :param filters: a list of event types to deliver. when empty, all + events will be delivered + :param timeout: maximum time to wait for messages to arrive; may be overridden + on individual calls to `receive`""" + if batch_timeout is not None and batch_timeout <= 0: + raise ValueError("batch_timeout must be a non-zero, positive value") + + self._comm_channel = comm_channel + self._backbone = backbone + self._global_filters = filters or [] + self._global_timeout = batch_timeout or 1.0 + + def receive( + self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0 + ) -> t.List[EventBase]: + """Receives available published event(s) + + :param filters: additional filters to add to the global filters configured + on the EventConsumer instance + :param timeout: maximum time to wait for messages to arrive + :returns: a list of events that pass any configured filters""" + if filters is None: + filters = [] + + filter_set = {*self._global_filters, *filters} + messages: t.List[t.Any] = [] + + # use the local timeout to override a global setting + start_at = time.time_ns() + + while msg_bytes_list := self._comm_channel.recv(timeout=timeout): + # remove any empty messages that will fail to decode + msg_bytes_list = [msg for msg in msg_bytes_list if msg] + + msg: t.Optional[EventBase] = None + if msg_bytes_list: + for message in msg_bytes_list: + msg = pickle.loads(message) + + if not msg: + logger.warning("Unable to unpickle message") + continue + + # ignore anything that doesn't match a filter (if one is + # supplied), otherwise return everything + if not filter_set or msg.category in filter_set: + messages.append(msg) + + # avoid getting stuck indefinitely waiting for the channel + elapsed = (time.time_ns() - start_at) / 1000000000 + remaining = elapsed - self._global_timeout + if remaining > 
0: + logger.debug(f"consumer batch timeout exceeded by: {abs(remaining)}") + break + + return messages diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index aee4aac529..c322c34e2c 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -47,48 +47,38 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: :param storage: A distributed dictionary to be used as the underlying storage mechanism of the feature store""" - self._storage = storage + if isinstance(storage, dragon_ddict.DDict): + descriptor = str(storage.serialize()) + else: + descriptor = "not-set" - def __getitem__(self, key: str) -> t.Union[str, bytes]: - """Retrieve an item using key + super().__init__(descriptor) + self._storage: t.Dict[str, t.Union[str, bytes]] = storage - :param key: Unique key of an item to retrieve from the feature store - :returns: The value identified by the supplied key - :raises KeyError: if the key is not found in the feature store - :raises SmartSimError: if retrieval from the feature store fails""" - try: - value: t.Union[str, bytes] = self._storage[key] - return value - except KeyError: - logger.warning(f"An unknown key was requested: {key}") - raise - except Exception as ex: - # note: explicitly avoid round-trip to check for key existence - raise SmartSimError( - f"Could not get value for existing key {key}, error:\n{ex}" - ) from ex + def _get(self, key: str) -> t.Union[str, bytes]: + """Retrieve a value from the underlying storage mechanism - def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: - """Assign a value using key + :param key: The unique key that identifies the resource + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" + return self._storage[key] - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" + def _set(self, key: str, value: t.Union[str, bytes]) -> None: + """Store a value into the underlying storage mechanism + + :param key: The unique key that identifies the resource + :param value: The value to store + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" self._storage[key] = value - def __contains__(self, key: str) -> bool: - """Membership operator to test for a key existing within the feature store. 
+ def _contains(self, key: str) -> bool: + """Determine if the storage mechanism contains a given key - :param key: Unique key of an item to retrieve from the feature store - :returns: `True` if the key is found, `False` otherwise""" + :param key: The unique key that identifies the resource + :returns: True if the key is defined, False otherwise""" return key in self._storage - @property - def descriptor(self) -> str: - """A unique identifier enabling a client to connect to the feature store - - :returns: A descriptor encoded as a string""" - return str(self._storage.serialize()) - @classmethod def from_descriptor( cls, diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index 31e3866e70..04e7134427 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -24,15 +24,39 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import enum import typing as t from abc import ABC, abstractmethod from dataclasses import dataclass +from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) +class ReservedKeys(str, enum.Enum): + """Contains constants used to identify all featurestore keys that + may not be to used by users. Avoids overwriting system data""" + + MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" + """Storage location for the list of registered consumers that will receive + events from an EventBroadcaster""" + + @classmethod + def contains(cls, value: str) -> bool: + """Convert a string representation into an enumeration member + + :param value: the string to convert + :returns: the enumeration member if the conversion succeeded, otherwise None""" + try: + cls(value) + except ValueError: + return False + + return True + + @dataclass(frozen=True) class FeatureStoreKey: """A key,descriptor pair enabling retrieval of an item from a feature store""" @@ -57,29 +81,104 @@ class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving values from a feature store implementation""" - @abstractmethod + def __init__(self, descriptor: str, allow_reserved_writes: bool = False) -> None: + """Initialize the feature store + + :param descriptor: the stringified version of a storage descriptor + :param allow_reserved_writes: override the default behavior of blocking + writes to reserved keys""" + self._enable_reserved_writes = allow_reserved_writes + """Flag used to ensure that any keys written by the system to a feature store + are not overwritten by user code. Disabled by default. Subclasses must set the + value intentionally.""" + self._descriptor = descriptor + """Stringified version of the unique ID enabling a client to connect + to the feature store""" + + def _check_reserved(self, key: str) -> None: + """A utility method used to verify access to write to a reserved key + in the FeatureStore. Used by subclasses in __setitem___ implementations + + :param key: a key to compare to the reserved keys + :raises SmartSimError: if the key is reserved""" + if not self._enable_reserved_writes and ReservedKeys.contains(key): + raise SmartSimError( + "Use of reserved key denied. 
" + "Unable to overwrite system configuration" + ) + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" + try: + return self._get(key) + except KeyError as ex: + raise SmartSimError(f"An unknown key was requested: {key}") from ex + except Exception as ex: + # note: explicitly avoid round-trip to check for key existence + raise SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex - @abstractmethod def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" + self._check_reserved(key) + self._set(key, value) - @abstractmethod def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. :param key: Unique key of an item to retrieve from the feature store :returns: `True` if the key is found, `False` otherwise""" + return self._contains(key) - @property @abstractmethod + def _get(self, key: str) -> t.Union[str, bytes]: + """Retrieve a value from the underlying storage mechanism + + :param key: The unique key that identifies the resource + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" + + @abstractmethod + def _set(self, key: str, value: t.Union[str, bytes]) -> None: + """Store a value into the underlying storage mechanism + + :param key: The unique key that identifies the resource + :param value: The value to store + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" + + @abstractmethod + def _contains(self, key: str) -> bool: + """Determine if the storage mechanism contains a given key + + :param key: The unique key that identifies the resource + :returns: `True` if the key is defined, `False` otherwise""" + + @property + def _allow_reserved_writes(self) -> bool: + """Return the boolean flag indicating if writing to reserved keys is + enabled for this feature store + + :returns: `True` if enabled, `False` otherwise""" + return self._enable_reserved_writes + + @_allow_reserved_writes.setter + def _allow_reserved_writes(self, value: bool) -> None: + """Modify the boolean flag indicating if writing to reserved keys is + enabled for this feature store + + :param value: the new value to set for the flag""" + self._enable_reserved_writes = value + + @property def descriptor(self) -> str: """Unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" + return self._descriptor diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index a61a243220..114db88d90 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -57,6 +57,7 @@ def _add_label_to_timings(self, label: str) -> None: @staticmethod def _format_number(number: t.Union[float, int]) -> str: + """Formats the input value with a fixed precision appropriate for logging""" return f"{number:0.4e}" def start_timings( @@ -64,6 +65,12 @@ def start_timings( first_label: t.Optional[str] = None, first_value: t.Optional[t.Union[float, int]] = None, ) -> None: + """Start a recording session by recording + + :param first_label: a label for an event that will be manually prepended + to the timing information before starting timers + :param first_label: a value for an event that 
will be manually prepended + to the timing information before starting timers""" if self._timing_on: if first_label is not None and first_value is not None: mod_label = self._make_label(first_label) @@ -75,6 +82,7 @@ def start_timings( self._interm = time.perf_counter() def end_timings(self) -> None: + """Record a timing event and clear the last checkpoint""" if self._timing_on and self._start is not None: mod_label = self._make_label("total_time") self._add_label_to_timings(mod_label) @@ -84,14 +92,24 @@ def end_timings(self) -> None: self._interm = None def _make_label(self, label: str) -> str: + """Return a label formatted with the current label prefix + + :param label: the original label + :returns: the adjusted label value""" return self._prefix + label - def _get_delta(self) -> t.Union[float, int]: + def _get_delta(self) -> float: + """Calculates the offset from the last intermediate checkpoint time + + :returns: the number of seconds elapsed""" if self._interm is None: return 0 return time.perf_counter() - self._interm def get_last(self, label: str) -> str: + """Return the last timing value collected for the given label in + the format `{label}: {value}`. If no timing value has been collected + with the label, returns `Not measured yet`""" mod_label = self._make_label(label) if mod_label in self._timings: value = self._timings[mod_label][-1] @@ -101,6 +119,9 @@ def get_last(self, label: str) -> str: return "Not measured yet" def measure_time(self, label: str) -> None: + """Record a new time event if timing is enabled + + :param label: the label to record a timing event for""" if self._timing_on and self._interm is not None: mod_label = self._make_label(label) self._add_label_to_timings(mod_label) @@ -110,16 +131,24 @@ def measure_time(self, label: str) -> None: self._interm = time.perf_counter() def _log(self, msg: str) -> None: + """Conditionally logs a message when the debug flag is enabled + + :param msg: the message to be logged""" if self._debug: logger.info(msg) @property def max_length(self) -> int: + """Returns the number of records contained in the largest timing set""" if len(self._timings) == 0: return 0 return max(len(value) for value in self._timings.values()) def print_timings(self, to_file: bool = False) -> None: + """Print all timing information + + :param to_file: flag indicating if timing should be written to stdout + or to the timing file""" print(" ".join(self._timings.keys())) try: value_array = np.array(list(self._timings.values()), dtype=float) @@ -133,11 +162,12 @@ def print_timings(self, to_file: bool = False) -> None: if to_file: np.save(self._prefix + self._filename + ".npy", value_array) - def set_active(self, active: bool = True) -> None: - """Set whether the timer will record time""" - self._timing_on = active - @property def is_active(self) -> bool: - """Returns true if the timer will record time""" + """Return `True` if timer is recording, `False` otherwise""" return self._timing_on + + @is_active.setter + def is_active(self, active: bool) -> None: + """Set to `True` to record timing information, `False` otherwise""" + self._timing_on = active diff --git a/tests/dragon/feature_store.py b/tests/dragon/feature_store.py index 178b675e64..d06b0b334e 100644 --- a/tests/dragon/feature_store.py +++ b/tests/dragon/feature_store.py @@ -37,76 +37,81 @@ class MemoryFeatureStore(FeatureStore): """A feature store with values persisted only in local memory""" - def __init__(self) -> None: + def __init__( + self, storage: t.Optional[t.Dict[str, t.Union[str, bytes]]] 
= None + ) -> None: """Initialize the MemoryFeatureStore instance""" - self._storage: t.Dict[str, bytes] = {} + super().__init__("in-memory-fs") + if storage is None: + storage = {"_": "abc"} + self._storage = storage - def __getitem__(self, key: str) -> bytes: - """Retrieve an item using key + def _get(self, key: str) -> t.Union[str, bytes]: + """Retrieve a value from the underlying storage mechanism - :param key: Unique key of an item to retrieve from the feature store""" - if key not in self._storage: - raise sse.SmartSimError(f"{key} not found in feature store") + :param key: The unique key that identifies the resource + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" return self._storage[key] - def __setitem__(self, key: str, value: bytes) -> None: - """Membership operator to test for a key existing within the feature store. + def _set(self, key: str, value: t.Union[str, bytes]) -> None: + """Store a value into the underlying storage mechanism - :param key: Unique key of an item to retrieve from the feature store - :returns: `True` if the key is found, `False` otherwise""" + :param key: The unique key that identifies the resource + :param value: The value to store + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" self._storage[key] = value - def __contains__(self, key: str) -> bool: - """Membership operator to test for a key existing within the feature store. + def _contains(self, key: str) -> bool: + """Determine if the storage mechanism contains a given key - :param key: Unique key of an item to retrieve from the feature store - :returns: `True` if the key is found, `False` otherwise""" + :param key: The unique key that identifies the resource + :returns: True if the key is defined, False otherwise""" return key in self._storage - @property - def descriptor(self) -> str: - """Unique identifier enabling a client to connect to the feature store - - :returns: A descriptor encoded as a string""" - return "file-system-fs" - class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all data on the file system""" - def __init__( - self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None - ) -> None: + def __init__(self, storage_dir: t.Union[pathlib.Path, str]) -> None: """Initialize the FileSystemFeatureStore instance :param storage_dir: (optional) root directory to store all data relative to""" if isinstance(storage_dir, str): storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir + super().__init__(storage_dir.as_posix()) - def __getitem__(self, key: str) -> bytes: - """Retrieve an item using key + def _get(self, key: str) -> t.Union[str, bytes]: + """Retrieve a value from the underlying storage mechanism - :param key: Unique key of an item to retrieve from the feature store""" + :param key: The unique key that identifies the resource + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" path = self._key_path(key) if not path.exists(): raise sse.SmartSimError(f"{path} not found in feature store") return path.read_bytes() - def __setitem__(self, key: str, value: bytes) -> None: - """Assign a value using key + def _set(self, key: str, value: t.Union[str, bytes]) -> None: + """Store a value into the underlying storage mechanism - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" + :param key: The unique key that identifies the resource + :param value: The value to store + :returns: the value identified by the key + :raises KeyError: if the key has not been used to store a value""" path = self._key_path(key, create=True) + if isinstance(value, str): + value = value.encode("utf-8") path.write_bytes(value) - def __contains__(self, key: str) -> bool: - """Membership operator to test for a key existing within the feature store. + def _contains(self, key: str) -> bool: + """Determine if the storage mechanism contains a given key - :param key: Unique key of an item to retrieve from the feature store - :returns: `True` if the key is found, `False` otherwise""" + :param key: The unique key that identifies the resource + :returns: True if the key is defined, False otherwise""" path = self._key_path(key) return path.exists() @@ -117,7 +122,7 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: :param key: Unique key of an item to retrieve from the feature store""" value = pathlib.Path(key) - if self._storage_dir: + if self._storage_dir is not None: value = self._storage_dir / key if create: @@ -125,15 +130,6 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value - @property - def descriptor(self) -> str: - """Unique identifier enabling a client to connect to the feature store - - :returns: A descriptor encoded as a string""" - if not self._storage_dir: - raise ValueError("No storage path configured") - return self._storage_dir.as_posix() - @classmethod def from_descriptor( cls, diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py new file mode 100644 index 0000000000..3c10319f81 --- /dev/null +++ b/tests/dragon/test_featurestore_base.py @@ -0,0 +1,722 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pathlib +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + EventCategory, + EventConsumer, + OnCreateConsumer, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +from smartsim.error import SmartSimError +from tests.mli.channel import FileSystemCommChannel +from tests.mli.feature_store import MemoryFeatureStore + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def test_event_uid() -> None: + """Verify that all events include a unique identifier""" + uids: t.Set[str] = set() + num_iters = 1000 + + # generate a bunch of events and keep track all the IDs + for i in range(num_iters): + event_a = OnCreateConsumer(str(i)) + event_b = OnWriteFeatureStore(str(i), "key") + + uids.add(event_a.uid) + uids.add(event_b.uid) + + # verify each event created a unique ID + assert len(uids) == 2 * num_iters + + +def test_mli_reserved_keys_conversion() -> None: + """Verify that conversion from a string to an enum member + works as expected""" + + for reserved_key in ReservedKeys: + # iterate through all keys and verify `from_string` works + assert ReservedKeys.contains(reserved_key.value) + + # show that the value (actual key) not the enum member name + # will not be incorrectly identified as reserved + assert not ReservedKeys.contains(str(reserved_key).split(".")[1]) + + +def test_mli_reserved_keys_writes() -> None: + """Verify that attempts to write to reserved keys are blocked from a + standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + + mock_storage = {} + dfs = DragonFeatureStore(mock_storage) + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + other = MemoryFeatureStore(mock_storage) + + expected_value = "value" + + for reserved_key in ReservedKeys: + # we expect every reserved key to fail using DragonFeatureStore... + with pytest.raises(SmartSimError) as ex: + dfs[reserved_key] = expected_value + + assert "reserved key" in ex.value.args[0] + + # ... 
and expect other feature stores to respect reserved keys + with pytest.raises(SmartSimError) as ex: + other[reserved_key] = expected_value + + assert "reserved key" in ex.value.args[0] + + # ...and those same keys to succeed on the backbone + backbone[reserved_key] = expected_value + actual_value = backbone[reserved_key] + assert actual_value == expected_value + + +def test_mli_consumers_read_by_key() -> None: + """Verify that the value returned from the mli consumers + method is written to the correct key and reads are + allowed via standard dragon feature store. + NOTE: should reserved reads also be blocked""" + + mock_storage = {} + dfs = DragonFeatureStore(mock_storage) + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + other = MemoryFeatureStore(mock_storage) + + expected_value = "value" + + # write using backbone that has permission to write reserved keys + backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + + # confirm read-only access to reserved keys from any FeatureStore + for fs in [dfs, backbone, other]: + assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value + + +def test_mli_consumers_read_by_backbone() -> None: + """Verify that the backbone reads the correct location + when using the backbone feature store API instead of mapping API""" + + mock_storage = {} + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + expected_value = "value" + + backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + + # confirm reading via convenience method returns expected value + assert backbone.notification_channels[0] == expected_value + + +def test_mli_consumers_write_by_backbone() -> None: + """Verify that the backbone writes the correct location + when using the backbone feature store API instead of mapping API""" + + mock_storage = {} + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + expected_value = ["value"] + + backbone.notification_channels = expected_value + + # confirm write using convenience method targets expected key + assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value) + + +def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: + """Verify that a broadcast operation without any registered subscribers + succeeds without raising Exceptions + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + consumer_descriptor = storage_path / "test-consumer" + + # NOTE: we're not putting any consumers into the backbone here! 
+ backbone = BackboneFeatureStore(mock_storage) + + event = OnCreateConsumer(consumer_descriptor) + + publisher = EventBroadcaster(backbone) + num_receivers = 0 + + # publishing this event without any known consumers registered should succeed + # but report that it didn't have anybody to send the event to + consumer_descriptor = storage_path / f"test-consumer" + event = OnCreateConsumer(consumer_descriptor) + + num_receivers += publisher.send(event) + + # confirm no changes to the backbone occur when fetching the empty consumer key + key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone + assert not key_in_features_store + + # confirm that the broadcast reports no events published + assert num_receivers == 0 + # confirm that the broadcast buffered the event for a later send + assert publisher.num_buffered == 1 + + +def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: + """Verify that a broadcast operation without any registered subscribers + succeeds without raising Exceptions + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + + # note: file-system descriptors are just paths + consumer_descriptor = storage_path / "test-consumer" + + # prep our backbone with a consumer list + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + backbone.notification_channels = [] + + event = OnCreateConsumer(consumer_descriptor) + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + num_receivers = publisher.send(event) + + registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] + + # confirm that no consumers exist in backbone to send to + assert not registered_consumers + # confirm that the broadcast reports no events published + assert num_receivers == 0 + # confirm that the broadcast buffered the event for a later send + assert publisher.num_buffered == 1 + + +def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: + """Verify that a broadcast operation reports an error if no channel + factory was supplied for constructing the consumer channels + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + + # note: file-system descriptors are just paths + consumer_descriptor = storage_path / "test-consumer" + + # prep our backbone with a consumer list + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + backbone.notification_channels = [consumer_descriptor] + + event = OnCreateConsumer(consumer_descriptor) + publisher = EventBroadcaster( + backbone, + # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied + ) + + with pytest.raises(SmartSimError) as ex: + publisher.send(event) + + assert "factory" in ex.value.args[0] + + +def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: + """Verify that a successful broadcast clears messages from the event + buffer when a new message is sent and consumers are registered + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + + # note: file-system descriptors are just paths + consumer_descriptor = storage_path / "test-consumer" + + backbone 
= BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + backbone.notification_channels = (consumer_descriptor,) + + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + + # mock building up some buffered events + num_buffered_events = 14 + for i in range(num_buffered_events): + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + publisher._event_buffer.append(bytes(event)) + + event0 = OnCreateConsumer( + storage_path / f"test-consumer-{str(num_buffered_events + 1)}" + ) + + num_receivers = publisher.send(event0) + # 1 receiver x 15 total events == 15 events + assert num_receivers == num_buffered_events + 1 + + +@pytest.mark.parametrize( + "num_consumers, num_buffered, expected_num_sent", + [ + pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"), + pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"), + pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"), + pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"), + pytest.param(9, 0, 9, id="13 x (0+1) - multi-consumer, empty buffer"), + ], +) +def test_eventpublisher_broadcast_returns_total_sent( + test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int +) -> None: + """Verify that a successful broadcast returns the total number of events + sent, including buffered messages. + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs + :param num_consumers: the number of consumers to mock setting up prior to send + :param num_buffered: the number of pre-buffered events to mock up + :param expected_num_sent: the expected result from calling send + """ + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + + # note: file-system descriptors are just paths + consumers = [] + for i in range(num_consumers): + consumers.append(storage_path / f"test-consumer-{i}") + + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + backbone.notification_channels = consumers + + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + + # mock building up some buffered events + for i in range(num_buffered): + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + publisher._event_buffer.append(bytes(event)) + + assert publisher.num_buffered == num_buffered + + # this event will trigger clearing anything already in buffer + event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}") + + # num_receivers should contain a number that computes w/all consumers and all events + num_receivers = publisher.send(event0) + + assert num_receivers == expected_num_sent + + +def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: + """Verify that any unused consumers are pruned each time a new event is sent + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + mock_storage = {} + + # note: file-system descriptors are just paths + consumer_descriptor = storage_path / "test-consumer" + + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + + event = OnCreateConsumer(consumer_descriptor) + + # the only registered cnosumer is in the event, expect no pruning + 
backbone.notification_channels = (consumer_descriptor,) + + publisher.send(event) + assert str(consumer_descriptor) in publisher._channel_cache + assert len(publisher._channel_cache) == 1 + + # add a new descriptor for another event... + consumer_descriptor2 = storage_path / "test-consumer-2" + # ... and remove the old descriptor from the backbone when it's looked up + backbone.notification_channels = (consumer_descriptor2,) + + event = OnCreateConsumer(consumer_descriptor2) + + publisher.send(event) + + assert str(consumer_descriptor2) in publisher._channel_cache + assert str(consumer_descriptor) not in publisher._channel_cache + assert len(publisher._channel_cache) == 1 + + # test multi-consumer pruning by caching some extra channels + prune0, prune1, prune2 = "abc", "def", "ghi" + publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned" + publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned" + publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned" + + # add in one of our old channels so we prune the above items, send to these + backbone.notification_channels = (consumer_descriptor, consumer_descriptor2) + + publisher.send(event) + + assert str(consumer_descriptor2) in publisher._channel_cache + + # NOTE: we should NOT prune something that isn't used by this message but + # does appear in `backbone.notification_channels` + assert str(consumer_descriptor) in publisher._channel_cache + + # confirm all of our items that were not in the notification channels are gone + for pruned in [prune0, prune1, prune2]: + assert pruned not in publisher._channel_cache + + # confirm we have only the two expected items in the channel cache + assert len(publisher._channel_cache) == 2 + + +def test_eventpublisher_serialize_failure( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that errors during message serialization are raised to the caller + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs + :param monkeypatch: pytest fixture for modifying behavior of existing code + with mock implementations""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are just paths + target_descriptor = str(storage_path / "test-consumer") + + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + + with monkeypatch.context() as patch: + event = OnCreateConsumer(target_descriptor) + + # patch the __bytes__ implementation to cause pickling to fail during send + patch.setattr(event, "__bytes__", lambda x: b"abc") + + backbone.notification_channels = (target_descriptor,) + + # send a message into the channel + with pytest.raises(ValueError) as ex: + publisher.send(event) + + assert "serialize" in ex.value.args[0] + + +def test_eventpublisher_factory_failure( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that errors during channel construction are raised to the caller + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs + :param monkeypatch: pytest fixture for modifying behavior of existing code + with mock implementations""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are 
just paths + target_descriptor = str(storage_path / "test-consumer") + + def boom(descriptor: str) -> None: + raise Exception(f"you shall not pass! {descriptor}") + + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + publisher = EventBroadcaster(backbone, channel_factory=boom) + + with monkeypatch.context() as patch: + event = OnCreateConsumer(target_descriptor) + + backbone.notification_channels = (target_descriptor,) + + # send a message into the channel + with pytest.raises(SmartSimError) as ex: + publisher.send(event) + + assert "construct" in ex.value.args[0] + + +def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: + """Verify that unexpected errors during message send are caught and wrapped in a + SmartSimError so they are not propagated directly to the caller + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs + :param monkeypatch: pytest fixture for modifying behavior of existing code + with mock implementations""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are just paths + target_descriptor = str(storage_path / "test-consumer") + + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + publisher = EventBroadcaster( + backbone, channel_factory=FileSystemCommChannel.from_descriptor + ) + + def boom(self) -> None: + raise Exception("That was unexpected...") + + with monkeypatch.context() as patch: + event = OnCreateConsumer(target_descriptor) + + # patch the _broadcast implementation to cause send to fail after + # after the event has been pickled + patch.setattr(publisher, "_broadcast", boom) + + backbone.notification_channels = (target_descriptor,) + + # Here, we see the exception raised by broadcast that isn't expected + # is not allowed directly out, and instead is wrapped in SmartSimError + with pytest.raises(SmartSimError) as ex: + publisher.send(event) + + assert "unexpected" in ex.value.args[0] + + +def test_eventconsumer_receive(test_dir: str) -> None: + """Verify that a consumer retrieves a message from the given channel + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are just paths + target_descriptor = str(storage_path / "test-consumer") + + backbone = BackboneFeatureStore(mock_storage) + comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) + event = OnCreateConsumer(target_descriptor) + + # simulate a sent event by writing directly to the input comm channel + comm_channel.send(bytes(event)) + + consumer = EventConsumer(comm_channel, backbone) + + all_received: t.List[OnCreateConsumer] = consumer.receive() + assert len(all_received) == 1 + + # verify we received the same event that was raised + assert all_received[0].category == event.category + assert all_received[0].descriptor == event.descriptor + + +@pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) +def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: + """Verify that a consumer retrieves multiple message from the given channel + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs + :param num_sent: parameterized 
value used to vary the number of events + that are enqueued and validations are checked at multiple queue sizes""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are just paths + target_descriptor = str(storage_path / "test-consumer") + + backbone = BackboneFeatureStore(mock_storage) + comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) + + # simulate multiple sent events by writing directly to the input comm channel + for _ in range(num_sent): + event = OnCreateConsumer(target_descriptor) + comm_channel.send(bytes(event)) + + consumer = EventConsumer(comm_channel, backbone) + + all_received: t.List[OnCreateConsumer] = consumer.receive() + assert len(all_received) == num_sent + + +def test_eventconsumer_receive_empty(test_dir: str) -> None: + """Verify that a consumer receiving an empty message ignores the + message and continues processing + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + + # note: file-system descriptors are just paths + target_descriptor = str(storage_path / "test-consumer") + + backbone = BackboneFeatureStore(mock_storage) + comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) + + # simulate a sent event by writing directly to the input comm channel + comm_channel.send(bytes(b"")) + + consumer = EventConsumer(comm_channel, backbone) + + messages = consumer.receive() + + # the messages array should be empty + assert not messages + + +def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: + """Verify that the publisher and consumer integrate as expected when + multiple publishers and consumers are sending simultaneously. + + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + mock_fs_descriptor = str(storage_path / f"mock-feature-store") + + wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr") + capp_channel = FileSystemCommChannel(storage_path / "test-capp") + back_channel = FileSystemCommChannel(storage_path / "test-backend") + + wmgr_consumer_descriptor = wmgr_channel.descriptor.decode("utf-8") + capp_consumer_descriptor = capp_channel.descriptor.decode("utf-8") + back_consumer_descriptor = back_channel.descriptor.decode("utf-8") + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + ) + capp_consumer = EventConsumer( + capp_channel, + backbone, + ) + back_consumer = EventConsumer( + back_channel, + backbone, + filters=[EventCategory.CONSUMER_CREATED], + ) + + # create some broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + backbone, + channel_factory=FileSystemCommChannel.from_descriptor, + ) + mock_client_app = EventBroadcaster( + backbone, + channel_factory=FileSystemCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. 
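The wiring that follows (consumer registration, broadcast, filtered receive) is the heart of the pub/sub API exercised by this test. A condensed sketch of that flow, using only the classes and call signatures that appear in this patch and treating the temporary path and keys as placeholders, reads roughly as follows:

```python
# Condensed sketch of the publish/subscribe flow exercised by this test.
# It assumes only the APIs used in this patch; the temporary path and the
# feature-store descriptor string are placeholders.
import pathlib

from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
    EventBroadcaster,
    EventCategory,
    EventConsumer,
    OnWriteFeatureStore,
)
from tests.mli.channel import FileSystemCommChannel

backbone = BackboneFeatureStore({}, allow_reserved_writes=True)

# a consumer listens on a channel and keeps only FEATURE_STORE_WRITTEN events
channel = FileSystemCommChannel(pathlib.Path("/tmp/demo-consumer"))
consumer = EventConsumer(
    channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN]
)

# registering the channel descriptor is what makes the consumer reachable
backbone.notification_channels = [channel.descriptor.decode("utf-8")]

# broadcasters rebuild comm channels from registered descriptors on demand
publisher = EventBroadcaster(
    backbone, channel_factory=FileSystemCommChannel.from_descriptor
)
publisher.send(OnWriteFeatureStore("mock-fs-descriptor", "some-key"))

received = consumer.receive()
assert all(e.category == EventCategory.FEATURE_STORE_WRITTEN for e in received)
```

Filtering happens on the consumer side, so a single broadcast reaches every registered channel and each consumer keeps only the event categories it cares about.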
+ backbone.notification_channels = [ + wmgr_consumer_descriptor, + capp_consumer_descriptor, + back_consumer_descriptor, + ] + + # simulate worker manager sending a notification to backend that it's alive + event_1 = OnCreateConsumer(wmgr_consumer_descriptor) + mock_worker_mgr.send(event_1) + + # simulate the app updating a model a few times + event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") + event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + + mock_client_app.send(event_2) + mock_client_app.send(event_3) + mock_client_app.send(event_4) + + # worker manager should only get updates about feature update + wmgr_messages = wmgr_consumer.receive() + assert len(wmgr_messages) == 3 + + # the backend should only receive messages about consumer creation + back_messages = back_consumer.receive() + assert len(back_messages) == 1 + + # hypothetical app has no filters and will get all events + app_messages = capp_consumer.receive() + assert len(app_messages) == 4 + + +@pytest.mark.parametrize("invalid_timeout", [-100.0, -1.0, 0.0]) +def test_eventconsumer_batch_timeout( + invalid_timeout: float, + test_dir: str, +) -> None: + """Verify that a consumer allows only positive, non-zero values for timeout + if it is supplied. + + :param invalid_timeout: any invalid timeout that should fail validation + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + storage_path = pathlib.Path(test_dir) / "features" + storage_path.mkdir(parents=True, exist_ok=True) + + mock_storage = {} + backbone = BackboneFeatureStore(mock_storage) + + channel = FileSystemCommChannel(storage_path / "test-wmgr") + + with pytest.raises(ValueError) as ex: + # try to create a consumer w/a max recv size of 0 + EventConsumer( + channel, + backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + batch_timeout=invalid_timeout, + ) + + assert "positive" in ex.value.args[0] diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py new file mode 100644 index 0000000000..59801eebe2 --- /dev/null +++ b/tests/dragon/test_featurestore_integration.py @@ -0,0 +1,267 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import ( + DEFAULT_CHANNEL_BUFFER_SIZE, + DragonCommChannel, + create_local, +) +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + EventCategory, + EventConsumer, + OnCreateConsumer, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict + +# isort: off +from dragon.channels import Channel + +# isort: on + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file must run in a dragon environment +pytestmark = pytest.mark.dragon + + +@pytest.fixture +def storage_for_dragon_fs() -> t.Dict[str, str]: + return dragon_ddict.DDict() + + +def test_eventconsumer_eventpublisher_integration( + storage_for_dragon_fs: t.Any, test_dir: str +) -> None: + """Verify that the publisher and consumer integrate as expected when + multiple publishers and consumers are sending simultaneously. This + test closely tracks the test in tests/test_featurestore.py also named + test_eventconsumer_eventpublisher_integration but requires dragon entities + + :param storage_for_dragon_fs: the dragon storage engine to use + :param test_dir: pytest fixture automatically generating unique working + directories for individual test outputs""" + + mock_storage = storage_for_dragon_fs + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + mock_fs_descriptor = backbone.descriptor + + # verify ability to write and read from ddict + backbone["test_dir"] = test_dir + assert backbone["test_dir"] == test_dir + + wmgr_channel_ = Channel.make_process_local() + capp_channel_ = Channel.make_process_local() + back_channel_ = Channel.make_process_local() + + wmgr_channel = DragonCommChannel(wmgr_channel_) + capp_channel = DragonCommChannel(capp_channel_) + back_channel = DragonCommChannel(back_channel_) + + wmgr_consumer_descriptor = wmgr_channel.descriptor_string + capp_consumer_descriptor = capp_channel.descriptor_string + back_consumer_descriptor = back_channel.descriptor_string + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + ) + capp_consumer = EventConsumer( + capp_channel, + backbone, + ) + back_consumer = EventConsumer( + back_channel, + backbone, + filters=[EventCategory.CONSUMER_CREATED], + ) + + # create some broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + mock_client_app = EventBroadcaster( + backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. 
+ backbone.notification_channels = [ + wmgr_consumer_descriptor, + capp_consumer_descriptor, + back_consumer_descriptor, + ] + + # simulate worker manager sending a notification to backend that it's alive + event_1 = OnCreateConsumer(wmgr_consumer_descriptor) + mock_worker_mgr.send(event_1) + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore(backbone.descriptor, key) + mock_client_app.send(event, timeout=0.1) + + # worker manager should only get updates about feature update + wmgr_messages = wmgr_consumer.receive() + assert len(wmgr_messages) == 3 + + # the backend should only receive messages about consumer creation + back_messages = back_consumer.receive() + assert len(back_messages) == 1 + + # hypothetical app has no filters and will get all events + app_messages = capp_consumer.receive() + assert len(app_messages) == 4 + + +@pytest.mark.parametrize( + "num_events, batch_timeout", + [ + pytest.param(1, 1.0, id="under 1s timeout"), + pytest.param(20, 1.0, id="test 1s timeout w/20"), + pytest.param(50, 1.0, id="test 1s timeout w/50"), + pytest.param(60, 0.1, id="small batches"), + pytest.param(100, 0.1, id="many small batches"), + ], +) +def test_eventconsumer_max_dequeue( + num_events: int, + batch_timeout: float, + storage_for_dragon_fs: t.Any, +) -> None: + """Verify that a consumer does not sit and collect messages indefinitely + by checking that a consumer returns after a maximum timeout is exceeded + + :param num_events: the total number of events to raise in the test + :param batch_timeout: the maximum wait time for a message to be sent. + :param storage_for_dragon_fs: the dragon storage engine to use""" + + mock_storage = storage_for_dragon_fs + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + + wmgr_channel_ = Channel.make_process_local() + wmgr_channel = DragonCommChannel(wmgr_channel_) + wmgr_consumer_descriptor = wmgr_channel.descriptor_string + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + batch_timeout=batch_timeout, + ) + + # create a broadcaster to publish messages + mock_client_app = EventBroadcaster( + backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. 
+ backbone.notification_channels = [wmgr_consumer_descriptor] + + # simulate the app updating a model a lot of times + for key in (f"key-{i}" for i in range(num_events)): + event = OnWriteFeatureStore(backbone.descriptor, key) + mock_client_app.send(event, timeout=0.1) + + num_dequeued = 0 + + while wmgr_messages := wmgr_consumer.receive(timeout=0.01): + # worker manager should not get more than `max_num_msgs` events + num_dequeued += len(wmgr_messages) + + # make sure we made all the expected dequeue calls and got everything + assert num_dequeued == num_events + + +@pytest.mark.parametrize( + "buffer_size", + [ + pytest.param(-1, id="use default: 500"), + pytest.param(0, id="use default: 500"), + pytest.param(1, id="non-zero buffer size: 1"), + pytest.param(500, id="buffer size: 500"), + pytest.param(1000, id="buffer size: 1000"), + ], +) +def test_channel_buffer_size( + buffer_size: int, + storage_for_dragon_fs: t.Any, +) -> None: + """Verify that a channel used by an EventBroadcaster can buffer messages + until a configured maximum value is exceeded. + + :param buffer_size: the maximum number of messages allowed in a channel buffer + :param storage_for_dragon_fs: the dragon storage engine to use""" + + mock_storage = storage_for_dragon_fs + backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) + + wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size + wmgr_channel = DragonCommChannel(wmgr_channel_) + wmgr_consumer_descriptor = wmgr_channel.descriptor_string + + # create a broadcaster to publish messages. create no consumers to + # push the number of sent messages past the allotted buffer size + mock_client_app = EventBroadcaster( + backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. 
+ backbone.notification_channels = [wmgr_consumer_descriptor] + + if buffer_size < 1: + # NOTE: we set this after creating the channel above to ensure + # the default parameter value was used during instantiation + buffer_size = DEFAULT_CHANNEL_BUFFER_SIZE + + # simulate the app updating a model a lot of times + for key in (f"key-{i}" for i in range(buffer_size)): + event = OnWriteFeatureStore(backbone.descriptor, key) + mock_client_app.send(event, timeout=0.1) + + # adding 1 more over the configured buffer size should report the error + with pytest.raises(Exception) as ex: + mock_client_app.send(event, timeout=0.1) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index eeb8cd238b..ccdbce58c3 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -121,8 +121,8 @@ def mock_messages( for iteration_number in range(2): - channel_key = Channel.make_process_local().serialize() - callback_channel = DragonCommChannel(channel_key) + channel = Channel.make_process_local() + callback_channel = DragonCommChannel(channel) input_path = feature_store_root_dir / f"{iteration_number}/input.pt" output_path = feature_store_root_dir / f"{iteration_number}/output.pt" @@ -144,7 +144,7 @@ def mock_messages( message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=base64.b64encode(callback_channel.descriptor).decode("utf-8"), + reply_channel=base64.b64encode(channel.serialize()).decode("utf-8"), model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 08b659c072..6cde6258f2 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -24,11 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
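The rewritten test channel below switches from a single binary blob per file to newline-delimited, base64-encoded records, so multiple messages can coexist in one file and be consumed one at a time. A standalone sketch of that framing, with hypothetical helper names and a throwaway path that are not part of the patch:

```python
# Standalone sketch of the framing used by the updated FileSystemCommChannel:
# one base64-encoded message per line, consumed oldest-first. The helper
# names and the path are illustrative, not part of the patch.
import base64
import pathlib
import typing as t

def append_message(path: pathlib.Path, payload: bytes) -> None:
    """Append one message as a newline-delimited, base64-encoded record."""
    with open(path, "a") as fp:
        fp.write(base64.b64encode(payload).decode("utf-8") + "\n")

def pop_message(path: pathlib.Path) -> t.Optional[bytes]:
    """Remove and return the oldest message, writing the remainder back."""
    if not path.exists():
        return None
    lines = path.read_text().splitlines(keepends=True)
    if not lines:
        return None
    path.write_text("".join(lines[1:]))
    return base64.b64decode(lines[0].strip())

demo = pathlib.Path("/tmp/demo-channel.txt")
append_message(demo, b"hello")
append_message(demo, b"world")
assert pop_message(demo) == b"hello"  # FIFO: the oldest line is consumed first
assert pop_message(demo) == b"world"
```

Writing in text mode and base64-encoding each payload keeps the file printable while still allowing arbitrary bytes in every message.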
+import base64 import pathlib import threading import typing as t from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) @@ -42,7 +44,6 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if not isinstance(key, bytes): super().__init__(key.as_posix().encode("utf-8")) self._file_path = key @@ -55,25 +56,56 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: self._file_path.touch() - def send(self, value: bytes) -> None: + def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to send :param value: The value to send""" - logger.debug( - f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" - ) with self._lock: - self._file_path.write_bytes(value) + # write as text so we can add newlines as delimiters + with open(self._file_path, "a") as fp: + encoded_value = base64.b64encode(value).decode("utf-8") + fp.write(f"{encoded_value}\n") + logger.debug(f"FileSystemCommChannel {self._file_path} sent message") + + def recv(self, timeout: float = 0) -> t.List[bytes]: + """Receives message(s) through the underlying communication channel + + :param timeout: maximum time to wait (in seconds) for messages to arrive + :returns: the received message + :raises SmartSimError: if the descriptor points to a missing file""" + with self._lock: + messages: t.List[bytes] = [] + if not self._file_path.exists(): + raise SmartSimError("Empty channel") - def recv(self) -> bytes: - """Receieve a message through the underlying communication channel + # read as text so we can split on newlines + with open(self._file_path, "r") as fp: + lines = fp.readlines() - :returns: the received message""" - with self._lock: - if self._file_path.exists(): - incoming = self._file_path.read_bytes() - self._file_path.unlink() - return incoming + if lines: + line = lines.pop(0) + event_bytes = base64.b64decode(line.encode("utf-8")) + messages.append(event_bytes) + + self.clear() + + # remove the first message only, write remainder back... + if len(lines) > 0: + with open(self._file_path, "w") as fp: + fp.writelines(lines) + + logger.debug( + f"FileSystemCommChannel {self._file_path} received message" + ) + + return messages + + def clear(self) -> None: + """Create an empty file for events""" + if self._file_path.exists(): + self._file_path.unlink() + self._file_path.touch() @classmethod def from_descriptor( @@ -91,4 +123,5 @@ def from_descriptor( path = pathlib.Path(descriptor.decode("utf-8")) return FileSystemCommChannel(path) except: - print("failed to create FS comm channel: {descriptor}") + logger.warning(f"failed to create fs comm channel: {descriptor!r}") + raise diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 226e8683dd..2348784236 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -24,11 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
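The tests/mli copy of the channel below gets the same framing plus a more permissive `from_descriptor` that accepts either `str` or `bytes`. The round trip it enables looks roughly like this sketch (the path is a placeholder):

```python
# Sketch of the descriptor round trip enabled by the changes below; the
# descriptor is the channel's file path as UTF-8 bytes, and from_descriptor
# accepts a str (or bytes) form of that same path.
import pathlib

from tests.mli.channel import FileSystemCommChannel

channel = FileSystemCommChannel(pathlib.Path("/tmp/demo-consumer"))
descriptor = channel.descriptor  # bytes, e.g. b"/tmp/demo-consumer"

# a broadcaster can rebuild an equivalent channel from the descriptor alone
rebuilt = FileSystemCommChannel.from_descriptor(descriptor.decode("utf-8"))

rebuilt.send(b"ping")
assert channel.recv() == [b"ping"]  # recv() returns a list of raw messages
```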
+import base64 import pathlib import threading import typing as t from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) @@ -42,7 +44,7 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if not isinstance(key, bytes): + if isinstance(key, pathlib.Path): super().__init__(key.as_posix().encode("utf-8")) self._file_path = key else: @@ -54,38 +56,72 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: self._file_path.touch() - def send(self, value: bytes) -> None: + def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel + :param timeout: maximum time to wait (in seconds) for messages to send :param value: The value to send""" - logger.debug( - f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" - ) with self._lock: - self._file_path.write_bytes(value) + # write as text so we can add newlines as delimiters + with open(self._file_path, "a") as fp: + encoded_value = base64.b64encode(value).decode("utf-8") + fp.write(f"{encoded_value}\n") + logger.debug(f"FileSystemCommChannel {self._file_path} sent message") - def recv(self) -> bytes: - """Receieve a message through the underlying communication channel + def recv(self, timeout: float = 0) -> t.List[bytes]: + """Receives message(s) through the underlying communication channel - :returns: the received message""" + :param timeout: maximum time to wait (in seconds) for messages to arrive + :returns: the received message + :raises SmartSimError: if the descriptor points to a missing file""" with self._lock: - if self._file_path.exists(): - incoming = self._file_path.read_bytes() - self._file_path.unlink() - return incoming + messages: t.List[bytes] = [] + if not self._file_path.exists(): + raise SmartSimError("Empty channel") + + # read as text so we can split on newlines + with open(self._file_path, "r") as fp: + lines = fp.readlines() + + if lines: + line = lines.pop(0) + event_bytes = base64.b64decode(line.encode("utf-8")) + messages.append(event_bytes) + + self.clear() + + # remove the first message only, write remainder back... 
+ if len(lines) > 0: + with open(self._file_path, "w") as fp: + fp.writelines(lines) + + logger.debug( + f"FileSystemCommChannel {self._file_path} received message" + ) + + return messages + + def clear(self) -> None: + """Create an empty file for events""" + if self._file_path.exists(): + self._file_path.unlink() + self._file_path.touch() @classmethod def from_descriptor( cls, - descriptor: str, + descriptor: t.Union[str, bytes], ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - path = pathlib.Path(descriptor) + if isinstance(descriptor, str): + path = pathlib.Path(descriptor) + else: + path = pathlib.Path(descriptor.decode("utf-8")) return FileSystemCommChannel(path) except: - print(f"failed to create fs comm channel: {descriptor}") + logger.warning(f"failed to create fs comm channel: {descriptor}") raise diff --git a/tests/mli/feature_store.py b/tests/mli/feature_store.py index 7ecc01814c..7bc18253c8 100644 --- a/tests/mli/feature_store.py +++ b/tests/mli/feature_store.py @@ -37,11 +37,14 @@ class MemoryFeatureStore(FeatureStore): """A feature store with values persisted only in local memory""" - def __init__(self) -> None: + def __init__(self, storage: t.Optional[t.Dict[str, bytes]] = None) -> None: """Initialize the MemoryFeatureStore instance""" - self._storage: t.Dict[str, bytes] = {} + super().__init__("in-memory-fs") + if storage is None: + storage = {"_": "abc"} + self._storage: t.Dict[str, bytes] = storage - def __getitem__(self, key: str) -> bytes: + def _get(self, key: str) -> bytes: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @@ -49,42 +52,35 @@ def __getitem__(self, key: str) -> bytes: raise sse.SmartSimError(f"{key} not found in feature store") return self._storage[key] - def __setitem__(self, key: str, value: bytes) -> None: + def _set(self, key: str, value: bytes) -> None: """Membership operator to test for a key existing within the feature store. :param key: Unique key of an item to retrieve from the feature store :returns: `True` if the key is found, `False` otherwise""" + self._check_reserved(key) self._storage[key] = value - def __contains__(self, key: str) -> bool: + def _contains(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage - @property - def descriptor(self) -> str: - """Unique identifier enabling a client to connect to the feature store - - :returns: A descriptor encoded as a string""" - return "in-memory-fs" - class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all data on the file system""" - def __init__( - self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None - ) -> None: + def __init__(self, storage_dir: t.Union[pathlib.Path, str] = None) -> None: """Initialize the FileSystemFeatureStore instance :param storage_dir: (optional) root directory to store all data relative to""" if isinstance(storage_dir, str): storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir + super().__init__(storage_dir.as_posix()) - def __getitem__(self, key: str) -> bytes: + def _get(self, key: str) -> bytes: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @@ -93,15 +89,17 @@ def __getitem__(self, key: str) -> bytes: raise sse.SmartSimError(f"{path} not found in feature store") return path.read_bytes() - def __setitem__(self, key: str, value: bytes) -> None: + def _set(self, key: str, value: bytes) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" path = self._key_path(key, create=True) + if isinstance(value, str): + value = value.encode("utf-8") path.write_bytes(value) - def __contains__(self, key: str) -> bool: + def _contains(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. :param key: Unique key of an item to retrieve from the feature store @@ -124,15 +122,6 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value - @property - def descriptor(self) -> str: - """Unique identifier enabling a client to connect to the feature store - - :returns: A descriptor encoded as a string""" - if not self._storage_dir: - raise ValueError("No storage path configured") - return self._storage_dir.as_posix() - @classmethod def from_descriptor( cls, diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index ea9b04d649..8be9c11a67 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -203,9 +203,9 @@ def test_build_request_indirect_successful( id="bad inputs", ), pytest.param( - b"reply channel", + "reply channel", model_key, - [model_key], + [torch_attributes], [output_key1, output_key2], [output_descriptor1], torch_attributes, @@ -221,10 +221,10 @@ def test_build_request_indirect_successful( id="bad outputs", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], - [model_key], + [torch_attributes], [output_descriptor1], tf_attributes, id="bad output schema type", From 28bfd8fa0892e6a2bc4f5b805809af6a31e67b00 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:52:35 -0700 Subject: [PATCH 25/60] Update MLI docstrings part 1 (#692) Part 1 of updating docstrings in the MLI. 
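The convention these updates converge on (a summary line ending in a period, reST-style field lists, and explicit `:raises:` entries) can be seen in a short, hypothetical example that is not itself part of the change set:

```python
# Hypothetical function illustrating the docstring style applied in this
# commit; the function itself is not part of the change set.
import typing as t

def fetch_value(store: t.Dict[str, bytes], key: str) -> bytes:
    """Retrieve a value from the underlying storage mechanism.

    :param store: The mapping to read from
    :param key: The unique key that identifies the resource
    :returns: The value identified by the key
    :raises KeyError: If the key has not been used to store a value
    """
    return store[key]
```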
[ committed by @AlyssaCote ] [ reviewed by @mellis13 ] --- doc/changelog.md | 1 + .../infrastructure/control/device_manager.py | 48 +++--- .../infrastructure/control/error_handling.py | 7 + .../control/request_dispatcher.py | 81 +++++++--- .../infrastructure/control/worker_manager.py | 16 +- .../mli/infrastructure/environment_loader.py | 2 +- .../_core/mli/infrastructure/worker/worker.py | 149 ++++++++++++++---- smartsim/_core/mli/message_handler.py | 61 ++++--- 8 files changed, 260 insertions(+), 105 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index b6f134d2a5..17fed285cc 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Update docstrings - Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention diff --git a/smartsim/_core/mli/infrastructure/control/device_manager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py index 54d58507ee..be0a05d064 100644 --- a/smartsim/_core/mli/infrastructure/control/device_manager.py +++ b/smartsim/_core/mli/infrastructure/control/device_manager.py @@ -36,8 +36,9 @@ class WorkerDevice: def __init__(self, name: str) -> None: - """Wrapper around a device to keep track of loaded Models and availability - :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` + """Wrapper around a device to keep track of loaded Models and availability. + + :param name: Name used by the toolkit to identify this device, e.g. ``cuda:0`` """ self._name = name """The name used by the toolkit to identify this device""" @@ -46,11 +47,14 @@ def __init__(self, name: str) -> None: @property def name(self) -> str: - """The identifier of the device represented by this object""" + """The identifier of the device represented by this object + + :return: Name used by the toolkit to identify this device + """ return self._name def add_model(self, key: str, model: t.Any) -> None: - """Add a reference to a model loaded on this device and assign it a key + """Add a reference to a model loaded on this device and assign it a key. :param key: The key under which the model is saved :param model: The model which is added @@ -58,30 +62,35 @@ def add_model(self, key: str, model: t.Any) -> None: self._models[key] = model def remove_model(self, key: str) -> None: - """Remove the reference to a model loaded on this device + """Remove the reference to a model loaded on this device. :param key: The key of the model to remove """ self._models.pop(key) def get_model(self, key: str) -> t.Any: - """Get the model corresponding to a given key + """Get the model corresponding to a given key. - :param key: the model key - :returns: the model for the given key + :param key: The model key + :returns: The model for the given key """ return self._models[key] def __contains__(self, key: str) -> bool: - """Check if model with a given key is available on the device + """Check if model with a given key is available on the device. - :param key: the key of the model to check for existence - :returns: whether the model is available on the device + :param key: The key of the model to check for existence + :returns: Whether the model is available on the device """ return key in self._models @contextmanager def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]: + """Get the WorkerDevice generator and optionally remove a model. 
+ + :param key_to_remove: The key of the model to optionally remove + :returns: WorkerDevice generator + """ yield self if key_to_remove is not None: self.remove_model(key_to_remove) @@ -93,7 +102,8 @@ def __init__(self, device: WorkerDevice): The main goal of the ``DeviceManager`` is to ensure that the managed device is ready to be used by a worker to - run a given model + run a given model. + :param device: The managed device """ self._device = device @@ -105,13 +115,13 @@ def _load_model_on_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> None: - """Load the model needed to execute on a batch on the managed device. + """Load the model needed to execute a batch on the managed device. The model is loaded by the worker. - :param worker: the worker that loads the model - :param batch: the batch for which the model is needed - :param feature_stores: feature stores where the model could be stored + :param worker: The worker that loads the model + :param batch: The batch for which the model is needed + :param feature_stores: Feature stores where the model could be stored """ model_bytes = worker.fetch_model(batch, feature_stores) @@ -124,10 +134,10 @@ def get_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> _GeneratorContextManager[WorkerDevice]: - """Get the device managed by this object + """Get the device managed by this object. - the model needed to run the batch of requests is - guaranteed to be available on the model + The model needed to run the batch of requests is + guaranteed to be available on the device. :param worker: The worker that wants to access the device :param batch: The batch of requests diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py index 5a42a8bfa8..30cffb8c6b 100644 --- a/smartsim/_core/mli/infrastructure/control/error_handling.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -38,6 +38,13 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + """ + Builds a failure response message. + + :param status: Status enum + :param message: Status message + :return: Failure response + """ return MessageHandler.build_response( status=status, message=message, diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 21fd98893d..2b1bf58952 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -69,9 +69,10 @@ def __init__( ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a non-full queue. The time of the first item put is 0 seconds. - :param batch_size: Total capacity of the queue. + :param batch_size: Total capacity of the queue :param model_id: Key of the model which needs to be executed on the queued requests """ @@ -93,12 +94,18 @@ def __init__( @property def uid(self) -> str: - """ID of this queue""" + """ID of this queue. + + :return: Queue ID + """ return self._uid @property def model_id(self) -> ModelIdentifier: - """Key of the model which needs to be run on the queued requests""" + """Key of the model which needs to be run on the queued requests. 
+ + :return: Model key + """ return self._model_id def put( @@ -107,7 +114,8 @@ def put( block: bool = False, timeout: t.Optional[float] = 0.0, ) -> None: - """Put an inference request in the queue + """Put an inference request in the queue. + :param item: The request :param block: Whether to block when trying to put the item :param timeout: Time (in seconds) to wait if block==True @@ -119,14 +127,20 @@ def put( @property def _elapsed_time(self) -> float: - """Time elapsed since the first item was put on this queue""" + """Time elapsed since the first item was put on this queue. + + :return: Time elapsed + """ if self.empty() or self._first_put is None: return 0 return time.time() - self._first_put @property def ready(self) -> bool: - """True if the queue can be flushed""" + """Check if the queue can be flushed. + + :return: True if the queue can be flushed, False otherwise + """ if self.empty(): return False @@ -142,11 +156,15 @@ def make_disposable(self) -> None: @property def can_be_removed(self) -> bool: - """Whether this queue can be deleted and garbage collected""" + """Determine whether this queue can be deleted and garbage collected. + + :return: True if queue can be removed, False otherwise + """ return self.empty() and self._disposable def flush(self) -> list[t.Any]: - """Get all requests from queue + """Get all requests from queue. + :return: Requests waiting to be executed """ num_items = self.qsize() @@ -161,13 +179,20 @@ def flush(self) -> list[t.Any]: return items def full(self) -> bool: - """Return True if the queue has reached its maximum capacity""" + """Check if the queue has reached its maximum capacity. + + :return: True if the queue has reached its maximum capacity, + False otherwise + """ if self._disposable: return True return self.qsize() >= self._batch_size def empty(self) -> bool: - """Return True if the queue has 0 elements""" + """Check if the queue is empty. + + :return: True if the queue has 0 elements, False otherwise + """ return self.qsize() == 0 @@ -183,9 +208,10 @@ def __init__( """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker Managers. + :param batch_timeout: Maximum elapsed time before flushing a complete or incomplete batch - :param batch_size: Total capacity of each batch queue. + :param batch_size: Total capacity of each batch queue :param mem_pool: Memory pool used to share batched input tensors with worker managers :param config_loader: Object to load configuration from environment @@ -227,7 +253,7 @@ def __init__( """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: - """Ensures that all feature stores required by the request are available + """Ensures that all feature stores required by the request are available. :param request: The request to validate :returns: False if feature store validation fails for the request, True @@ -260,7 +286,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: # pylint: disable-next=no-self-use def _check_model(self, request: InferenceRequest) -> bool: - """Ensure that a model is available for the request + """Ensure that a model is available for the request. 
:param request: The request to validate :returns: False if model validation fails for the request, True otherwise @@ -273,7 +299,7 @@ def _check_model(self, request: InferenceRequest) -> bool: # pylint: disable-next=no-self-use def _check_inputs(self, request: InferenceRequest) -> bool: - """Ensure that inputs are available for the request + """Ensure that inputs are available for the request. :param request: The request to validate :returns: False if input validation fails for the request, True otherwise @@ -286,7 +312,7 @@ def _check_inputs(self, request: InferenceRequest) -> bool: # pylint: disable-next=no-self-use def _check_callback(self, request: InferenceRequest) -> bool: - """Ensure that a callback channel is available for the request + """Ensure that a callback channel is available for the request. :param request: The request to validate :returns: False if callback validation fails for the request, True otherwise @@ -298,7 +324,7 @@ def _check_callback(self, request: InferenceRequest) -> bool: return False def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed + """Ensure the request can be processed. :param request: The request to validate :return: False if the request fails any validation checks, True otherwise""" @@ -362,7 +388,7 @@ def _on_iteration(self) -> None: def remove_queues(self) -> None: """Remove references to queues that can be removed - and allow them to be garbage collected""" + and allow them to be garbage collected.""" queue_lists_to_remove = [] for key, queues in self._queues.items(): queues_to_remove = [] @@ -386,13 +412,16 @@ def remove_queues(self) -> None: @property def task_queue(self) -> DragonQueue: - """The queue on which batched requests are placed""" + """The queue on which batched requests are placed. + + :return: The queue + """ return self._outgoing_queue def _swap_queue(self, model_id: ModelIdentifier) -> None: """Get an empty queue or create a new one - and make it the active one for a given model. + :param model_id: The id of the model for which the queue has to be swapped """ @@ -411,8 +440,9 @@ def _swap_queue(self, model_id: ModelIdentifier) -> None: return def dispatch(self, request: InferenceRequest) -> None: - """Assign a request to a batch queue - :param request: the request to place + """Assign a request to a batch queue. + + :param request: The request to place """ if request.raw_model is not None: logger.debug("Direct inference requested, creating tmp queue") @@ -439,7 +469,7 @@ def dispatch(self, request: InferenceRequest) -> None: def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all - avaliable request batches in the outgoing queue. + available request batches in the outgoing queue. """ for queue_list in self._queues.values(): for queue in queue_list: @@ -497,11 +527,14 @@ def flush_requests(self) -> None: self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: - """Whether the Service can be shut down""" + """Determine whether the Service can be shut down. 
+ + :return: False + """ return False def __del__(self) -> None: - """Destroy allocated memory resources""" + """Destroy allocated memory resources.""" # pool may be null if a failure occurs prior to successful attach pool: t.Optional[MemoryPool] = getattr(self, "_mem_pool", None) diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index d831a879aa..8136be5974 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -62,7 +62,7 @@ class WorkerManager(Service): """An implementation of a service managing distribution of tasks to - machine learning workers""" + machine learning workers.""" def __init__( self, @@ -73,7 +73,7 @@ def __init__( cooldown: int = 0, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: - """Initialize the WorkerManager + """Initialize the WorkerManager. :param config_loader: Environment config loader for loading queues and feature stores @@ -115,7 +115,7 @@ def _on_start(self) -> None: self._device_manager = DeviceManager(WorkerDevice(self._device)) def _check_feature_stores(self, batch: RequestBatch) -> bool: - """Ensures that all feature stores required by the request are available + """Ensures that all feature stores required by the request are available. :param batch: The batch of requests to validate :returns: False if feature store validation fails for the batch, True otherwise @@ -146,7 +146,7 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: return True def _validate_batch(self, batch: RequestBatch) -> bool: - """Ensure the request can be processed + """Ensure the request can be processed. :param batch: The batch of requests to validate :return: False if the request fails any validation checks, True otherwise""" @@ -160,8 +160,7 @@ def _validate_batch(self, batch: RequestBatch) -> bool: # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete - - the inference pipeline""" + the inference pipeline.""" pre_batch_time = time.perf_counter() try: @@ -310,7 +309,10 @@ def _on_iteration(self) -> None: self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: - """Return true when the criteria to shut down the service are met.""" + """Determine if the service can be shutdown. + + :return: True when criteria to shutdown the service are met, False otherwise + """ # todo: determine shutdown criteria # will we receive a completion message? # will we let MLI mgr just kill this? diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index c8b158a5ad..364a3ebc9d 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -70,7 +70,7 @@ def __init__( def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in an environment variable. The backbone is a standalone, system-created - feature store used to share internal information among MLI components + feature store used to share internal information among MLI components. 
:returns: The attached feature store via _SMARTSIM_INFRA_BACKBONE""" descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 41de23b561..25a5ed0177 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -42,6 +42,7 @@ from ..storage.feature_store import FeatureStore, FeatureStoreKey if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.data.data_references_capnp import TensorKey from smartsim._core.mli.mli_schemas.response.response_capnp import Status from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor @@ -52,7 +53,7 @@ class InferenceRequest: - """Internal representation of an inference request from a client""" + """Internal representation of an inference request from a client.""" def __init__( self, @@ -65,7 +66,17 @@ def __init__( raw_model: t.Optional[Model] = None, batch_size: int = 0, ): - """Initialize the object""" + """Initialize the InferenceRequest. + + :param model_key: A tuple containing a (key, descriptor) pair + :param callback: The channel used for notification of inference completion + :param raw_inputs: Raw bytes of tensor inputs + :param input_keys: A list of tuples containing a (key, descriptor) pair + :param input_meta: Metadata about the input data + :param output_keys: A list of tuples containing a (key, descriptor) pair + :param raw_model: Raw bytes of an ML model + :param batch_size: The batch size to apply when batching + """ self.model_key = model_key """A tuple containing a (key, descriptor) pair""" self.raw_model = raw_model @@ -85,7 +96,7 @@ def __init__( class InferenceReply: - """Internal representation of the reply to a client request for inference""" + """Internal representation of the reply to a client request for inference.""" def __init__( self, @@ -94,18 +105,31 @@ def __init__( status_enum: "Status" = "running", message: str = "In progress", ) -> None: - """Initialize the object""" + """Initialize the InferenceReply. + + :param outputs: List of output data + :param output_keys: List of keys used for output data + :param status_enum: Status of the reply + :param message: Status message that corresponds with the status enum + """ self.outputs: t.Collection[t.Any] = outputs or [] + """List of output data""" self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] + """List of keys used for output data""" self.status_enum = status_enum + """Status of the reply""" self.message = message + """Status message that corresponds with the status enum""" class LoadModelResult: - """A wrapper around a loaded model""" + """A wrapper around a loaded model.""" def __init__(self, model: t.Any) -> None: - """Initialize the object""" + """Initialize the LoadModelResult. + + :param model: The loaded model + """ self.model = model @@ -119,7 +143,15 @@ def __init__( dims: list[list[int]], dtypes: list[str], ) -> None: - """Initialize the object""" + """Initialize the TransformInputResult. 
+ + :param result: List of Dragon MemoryAlloc objects on which + the tensors are stored + :param slices: The slices that represent which portion of the + input tensors belongs to which request + :param dims: Dimension of the transformed tensors + :param dtypes: Data type of transformed tensors + """ self.transformed = result """List of Dragon MemoryAlloc objects on which the tensors are stored""" self.slices = slices @@ -132,59 +164,94 @@ def __init__( class ExecuteResult: - """A wrapper around inference results""" + """A wrapper around inference results.""" def __init__(self, result: t.Any, slices: list[slice]) -> None: - """Initialize the object""" + """Initialize the ExecuteResult. + + :param result: Result of the execution + :param slices: The slices that represent which portion of the input + tensors belongs to which request + """ self.predictions = result + """Result of the execution""" self.slices = slices + """The slices that represent which portion of the input + tensors belongs to which request""" class FetchInputResult: - """A wrapper around fetched inputs""" + """A wrapper around fetched inputs.""" def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: - """Initialize the object""" + """Initialize the FetchInputResult. + + :param result: List of input tensor bytes + :param meta: List of metadata that corresponds with the inputs + """ self.inputs = result + """List of input tensor bytes""" self.meta = meta + """List of metadata that corresponds with the inputs""" class TransformOutputResult: - """A wrapper around inference results transformed for transmission""" + """A wrapper around inference results transformed for transmission.""" def __init__( self, result: t.Any, shape: t.Optional[t.List[int]], order: str, dtype: str ) -> None: - """Initialize the OutputTransformResult""" + """Initialize the TransformOutputResult. + + :param result: Transformed output results + :param shape: Shape of output results + :param order: Order of output results + :param dtype: Datatype of output results + """ self.outputs = result + """Transformed output results""" self.shape = shape + """Shape of output results""" self.order = order + """Order of output results""" self.dtype = dtype + """Datatype of output results""" class CreateInputBatchResult: - """A wrapper around inputs batched into a single request""" + """A wrapper around inputs batched into a single request.""" def __init__(self, result: t.Any) -> None: - """Initialize the object""" + """Initialize the CreateInputBatchResult. + + :param result: Inputs batched into a single request + """ self.batch = result + """Inputs batched into a single request""" class FetchModelResult: - """A wrapper around raw fetched models""" + """A wrapper around raw fetched models.""" def __init__(self, result: bytes) -> None: - """Initialize the object""" + """Initialize the FetchModelResult. 
+ + :param result: The raw fetched model + """ self.model_bytes: bytes = result + """The raw fetched model""" @dataclass class RequestBatch: - """A batch of aggregated inference requests""" + """A batch of aggregated inference requests.""" requests: list[InferenceRequest] + """List of InferenceRequests in the batch""" inputs: t.Optional[TransformInputResult] + """Transformed batch of input tensors""" model_id: ModelIdentifier + """Model (key, descriptor) tuple""" @property def has_valid_requests(self) -> bool: @@ -196,7 +263,7 @@ def has_valid_requests(self) -> bool: @property def has_raw_model(self) -> bool: - """Returns whether the batch has a raw model + """Returns whether the batch has a raw model. :return: True if the batch has a raw model """ @@ -206,6 +273,7 @@ def has_raw_model(self) -> bool: def raw_model(self) -> t.Optional[t.Any]: """Returns the raw model to use to execute for this batch if it is available. + :return: A model if available, otherwise None""" if self.has_valid_requests: return self.requests[0].raw_model @@ -213,7 +281,7 @@ def raw_model(self) -> t.Optional[t.Any]: @property def input_keys(self) -> t.List[FeatureStoreKey]: - """All input keys available in this batch's requests + """All input keys available in this batch's requests. :return: All input keys belonging to requests in this batch""" keys = [] @@ -224,7 +292,7 @@ def input_keys(self) -> t.List[FeatureStoreKey]: @property def output_keys(self) -> t.List[FeatureStoreKey]: - """All output keys available in this batch's requests + """All output keys available in this batch's requests. :return: All output keys belonging to requests in this batch""" keys = [] @@ -235,14 +303,15 @@ def output_keys(self) -> t.List[FeatureStoreKey]: class MachineLearningWorkerCore: - """Basic functionality of ML worker that is shared across all worker types""" + """Basic functionality of ML worker that is shared across all worker types.""" @staticmethod def deserialize_message( data_blob: bytes, callback_factory: t.Callable[[bytes], CommChannelBase], ) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest + """Deserialize a message from a byte stream into an InferenceRequest. + :param data_blob: The byte stream to deserialize :param callback_factory: A factory method that can create an instance of the desired concrete comm channel type @@ -295,6 +364,13 @@ def deserialize_message( @staticmethod def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + """Assemble the output information based on whether the output + information will be in the form of TensorKeys or TensorDescriptors. + + :param reply: The reply that the output belongs to + :return: The list of prepared outputs, depending on the output + information needed in the reply + """ prepared_outputs: t.List[t.Any] = [] if reply.output_keys: for value in reply.output_keys: @@ -316,13 +392,14 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: def fetch_model( batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] ) -> FetchModelResult: - """Given a resource key, retrieve the raw model from a feature store + """Given a resource key, retrieve the raw model from a feature store. 
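The `input_keys`/`output_keys` properties documented above simply concatenate the keys from every request in the batch. A minimal, self-contained sketch of that aggregation pattern follows; the `_Request`/`_Batch` names are placeholders, not the classes in this patch.

    import typing as t
    from dataclasses import dataclass, field


    @dataclass
    class _Request:
        # stand-in for InferenceRequest; only the key lists matter here
        input_keys: t.List[str] = field(default_factory=list)
        output_keys: t.List[str] = field(default_factory=list)


    @dataclass
    class _Batch:
        # stand-in for RequestBatch
        requests: t.List[_Request]

        @property
        def input_keys(self) -> t.List[str]:
            # aggregate input keys across every request in the batch
            keys: t.List[str] = []
            for request in self.requests:
                keys.extend(request.input_keys)
            return keys


    batch = _Batch([_Request(input_keys=["t1", "t2"]), _Request(input_keys=["t3"])])
    assert batch.input_keys == ["t1", "t2", "t3"]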
+ :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence :return: Raw bytes of the model - :raises SmartSimError: if neither a key or a model are provided or the + :raises SmartSimError: If neither a key or a model are provided or the model cannot be retrieved from the feature store - :raises ValueError: if a feature store is not available and a raw + :raises ValueError: If a feature store is not available and a raw model is not provided""" # All requests in the same batch share the model @@ -352,10 +429,11 @@ def fetch_inputs( batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location - and input metadata + and input metadata. + :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: the fetched input + :return: The fetched input :raises ValueError: If neither an input key or an input tensor are provided :raises SmartSimError: If a tensor for a given key cannot be retrieved""" fetch_results = [] @@ -398,7 +476,8 @@ def place_output( feature_stores: t.Dict[str, FeatureStore], ) -> t.Collection[t.Optional[FeatureStoreKey]]: """Given a collection of data, make it available as a shared resource in the - feature store + feature store. + :param request: The request that triggered the pipeline :param execute_result: Results from inference :param feature_stores: Available feature stores used for persistence @@ -431,10 +510,11 @@ def load_model( batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into - device memory + device memory. + :param request: The request that triggered the pipeline :param device: The device on which the model must be placed - :return: ModelLoadResult wrapping the model loaded for the request""" + :return: LoadModelResult wrapping the model loaded for the request""" @staticmethod @abstractmethod @@ -445,10 +525,11 @@ def transform_input( ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. + :param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors - :return: The transformed inputs wrapped in a InputTransformResult""" + :return: The transformed inputs wrapped in a TransformInputResult""" @staticmethod @abstractmethod @@ -458,7 +539,8 @@ def execute( transform_result: TransformInputResult, device: str, ) -> ExecuteResult: - """Execute an ML model on inputs transformed for use by the model + """Execute an ML model on inputs transformed for use by the model. + :param batch: The batch of requests that triggered the pipeline :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption @@ -472,6 +554,7 @@ def transform_output( ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. 
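Read together, the worker methods documented above describe the stages run for each batch. The sketch below is only an illustration of that documented call order: `worker`, `batch`, `feature_stores`, `mem_pool`, and `device` are assumed to be already-constructed objects, and the exact parameter lists are the ones given in the docstrings above.

    import typing as t


    def run_inference_pipeline(
        worker: t.Any,
        batch: t.Any,
        feature_stores: t.Dict[str, t.Any],
        mem_pool: t.Any,
        device: str,
    ) -> t.List[t.Any]:
        # fetch and load the model shared by every request in the batch
        model_result = worker.fetch_model(batch, feature_stores)
        load_result = worker.load_model(batch, model_result, device)

        # fetch raw inputs and stage them in the memory pool for batched execution
        fetch_results = worker.fetch_inputs(batch, feature_stores)
        transformed = worker.transform_input(batch, fetch_results, mem_pool)

        # execute and transform the outputs for transmission back to requestors
        execute_result = worker.execute(batch, load_result, transformed, device)
        return worker.transform_output(batch, execute_result)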
+ :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult :return: A list of transformed outputs""" diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index ee632e24ea..efc80c5195 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -48,7 +48,8 @@ def build_tensor_descriptor( :param order: Order of the tensor, such as row-major (c) or column-major (f) :param data_type: Data type of the tensor :param dimensions: Dimensions of the tensor - :raises ValueError: if building fails + :return: The TensorDescriptor + :raises ValueError: If building fails """ try: description = tensor_capnp.TensorDescriptor.new_message() @@ -56,9 +57,7 @@ def build_tensor_descriptor( description.dataType = data_type description.dimensions = dimensions except Exception as e: - raise ValueError( - "Error building tensor descriptor." - ) from e # TODO: create custom exception + raise ValueError("Error building tensor descriptor.") from e return description @@ -77,7 +76,8 @@ def build_output_tensor_descriptor( :param keys: List of TensorKeys to apply transorm descriptor to :param data_type: Tranform data type of the tensor :param dimensions: Transform dimensions of the tensor - :raises ValueError: if building fails + :return: The OutputDescriptor + :raises ValueError: If building fails """ try: description = tensor_capnp.OutputDescriptor.new_message() @@ -101,7 +101,8 @@ def build_tensor_key( :param key: String to set the TensorKey :param feature_store_descriptor: A descriptor identifying the feature store containing the key - :raises ValueError: if building fails + :return: The TensorKey + :raises ValueError: If building fails """ try: tensor_key = data_references_capnp.TensorKey.new_message() @@ -119,7 +120,8 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: :param data: Model data :param name: Model name :param version: Model version - :raises ValueError: if building fails + :return: The Model + :raises ValueError: If building fails """ try: model = model_capnp.Model.new_message() @@ -140,7 +142,8 @@ def build_model_key( :param key: String to set the ModelKey :param feature_store_descriptor: A descriptor identifying the feature store containing the key - :raises ValueError: if building fails + :return: The ModelKey + :raises ValueError: If building fails """ try: model_key = data_references_capnp.ModelKey.new_message() @@ -158,7 +161,8 @@ def build_torch_request_attributes( Builds a new TorchRequestAttributes message with the provided tensor type. :param tensor_type: Type of the tensor passed in - :raises ValueError: if building fails + :return: The TorchRequestAttributes + :raises ValueError: If building fails """ try: attributes = request_attributes_capnp.TorchRequestAttributes.new_message() @@ -177,7 +181,8 @@ def build_tf_request_attributes( :param name: Name of the tensor :param tensor_type: Type of the tensor passed in - :raises ValueError: if building fails + :return: The TensorFlowRequestAttributes + :raises ValueError: If building fails """ try: attributes = ( @@ -195,6 +200,8 @@ def build_torch_response_attributes() -> ( ): """ Builds a new TorchResponseAttributes message. + + :return: The TorchResponseAttributes """ return response_attributes_capnp.TorchResponseAttributes.new_message() @@ -204,6 +211,8 @@ def build_tf_response_attributes() -> ( ): """ Builds a new TensorFlowResponseAttributes message. 
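As a usage note for the builder methods documented above, a minimal sketch follows. The literal order, data-type, and tensor-type values are assumptions for illustration; the valid enumerants come from the tensor and request-attribute capnp schemas.

    from smartsim._core.mli.message_handler import MessageHandler

    # describe the layout of an input tensor (order, dtype, dimensions are placeholder values)
    descriptor = MessageHandler.build_tensor_descriptor("c", "float32", [1, 3, 224, 224])

    # framework-specific request attributes for a Torch model (tensor type is a placeholder value)
    attributes = MessageHandler.build_torch_request_attributes("sparse")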
+ + :return: The TensorFlowResponseAttributes """ return response_attributes_capnp.TensorFlowResponseAttributes.new_message() @@ -217,7 +226,7 @@ def _assign_model( :param request: Request being built :param model: Model to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: class_name = model.schema.node.displayName.split(":")[-1] # type: ignore @@ -240,7 +249,7 @@ def _assign_reply_channel( :param request: Request being built :param reply_channel: Reply channel to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: request.replyChannel.descriptor = reply_channel @@ -260,7 +269,7 @@ def _assign_inputs( :param request: Request being built :param inputs: Inputs to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: if inputs: @@ -286,7 +295,7 @@ def _assign_outputs( :param request: Request being built :param outputs: Outputs to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: request.output = outputs @@ -304,7 +313,7 @@ def _assign_output_descriptors( :param request: Request being built :param output_descriptors: Output descriptors to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: request.outputDescriptors = output_descriptors @@ -327,7 +336,7 @@ def _assign_custom_request_attributes( :param request: Request being built :param custom_attrs: Custom attributes to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: if custom_attrs is None: @@ -374,6 +383,7 @@ def build_request( :param outputs: Outputs to be assigned to request :param output_descriptors: Output descriptors to be assigned to request :param custom_attributes: Custom attributes to be assigned to request + :return: The Request """ request = request_capnp.Request.new_message() MessageHandler._assign_reply_channel(request, reply_channel) @@ -390,6 +400,7 @@ def serialize_request(request: request_capnp.RequestBuilder) -> bytes: Serializes a built request message. :param request: Request to be serialized + :return: Serialized request bytes """ return request.to_bytes() @@ -398,7 +409,8 @@ def deserialize_request(request_bytes: bytes) -> request_capnp.Request: """ Deserializes a serialized request message. 
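A sketch of assembling and round-tripping a request with the helpers documented above; the key names, feature store descriptors, and reply-channel bytes are illustrative placeholders, and keyword arguments are used so the sketch follows the parameter names in the docstrings rather than assuming an argument order.

    from smartsim._core.mli.message_handler import MessageHandler

    request = MessageHandler.build_request(
        reply_channel=b"reply-channel-descriptor",
        model=MessageHandler.build_model_key("resnet50", "fs-descriptor"),
        inputs=[MessageHandler.build_tensor_key("input_0", "fs-descriptor")],
        outputs=[MessageHandler.build_tensor_key("output_0", "fs-descriptor")],
        output_descriptors=[],
        custom_attributes=None,
    )

    payload = MessageHandler.serialize_request(request)     # capnp builder -> bytes
    restored = MessageHandler.deserialize_request(payload)  # bytes -> capnp Request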
- :param request_bytes: Bytes to be deserialized into a Request + :param request_bytes: Bytes to be deserialized into a request + :return: Deserialized request """ bytes_message = request_capnp.Request.from_bytes( request_bytes, traversal_limit_in_words=2**63 @@ -416,7 +428,7 @@ def _assign_status( :param response: Response being built :param status: Status to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: response.status = status @@ -430,7 +442,7 @@ def _assign_message(response: response_capnp.Response, message: str) -> None: :param response: Response being built :param message: Message to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: response.message = message @@ -451,7 +463,7 @@ def _assign_result( :param response: Response being built :param result: Result to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: if result: @@ -482,7 +494,7 @@ def _assign_custom_response_attributes( :param response: Response being built :param custom_attrs: Custom attributes to be assigned - :raises ValueError: if building fails + :raises ValueError: If building fails """ try: if custom_attrs is None: @@ -524,6 +536,7 @@ def build_response( :param message: Message to be assigned to response :param result: Result to be assigned to response :param custom_attributes: Custom attributes to be assigned to response + :return: The Response """ response = response_capnp.Response.new_message() MessageHandler._assign_status(response, status) @@ -536,6 +549,9 @@ def build_response( def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: """ Serializes a built response message. + + :param response: Response to be serialized + :return: Serialized response bytes """ return response.to_bytes() @@ -543,6 +559,9 @@ def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: def deserialize_response(response_bytes: bytes) -> response_capnp.Response: """ Deserializes a serialized response message. + + :param response_bytes: Bytes to be deserialized into a response + :return: Deserialized response """ bytes_message = response_capnp.Response.from_bytes( response_bytes, traversal_limit_in_words=2**63 From 61ab71d6e92889f685e6529614043b018b59c272 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:17:19 -0700 Subject: [PATCH 26/60] Update MLI docstrings part 2 (#699) Part 2 of updating docstrings in the MLI. 
[ committed by @AlyssaCote ] [ reviewed by @al-rigazzi ] --- doc/changelog.md | 2 +- smartsim/_core/mli/comm/channel/channel.py | 24 ++- .../_core/mli/comm/channel/dragon_channel.py | 44 ++++-- smartsim/_core/mli/comm/channel/dragon_fli.py | 30 ++-- .../infrastructure/control/device_manager.py | 4 +- .../infrastructure/control/error_handling.py | 2 +- .../control/request_dispatcher.py | 36 ++--- .../infrastructure/control/worker_manager.py | 7 +- .../mli/infrastructure/environment_loader.py | 16 +- .../storage/backbone_feature_store.py | 144 ++++++++++-------- .../storage/dragon_feature_store.py | 31 ++-- .../infrastructure/storage/feature_store.py | 81 +++++----- .../_core/mli/infrastructure/worker/worker.py | 26 ++-- smartsim/_core/mli/message_handler.py | 30 ++-- 14 files changed, 268 insertions(+), 209 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 17fed285cc..6e53070a19 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,7 +13,7 @@ Jump to: Description -- Update docstrings +- Update docstrings - Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 09d3ac62b7..bfe7920891 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -37,26 +37,34 @@ class CommChannelBase(ABC): """Base class for abstracting a message passing mechanism""" def __init__(self, descriptor: t.Union[str, bytes]) -> None: - """Initialize the CommChannel instance""" + """Initialize the CommChannel instance. + + :param descriptor: Channel descriptor + """ self._descriptor = descriptor @abstractmethod def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message through the underlying communication channel + """Send a message through the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param timeout: Maximum time to wait (in seconds) for messages to send + :param value: The value to send + """ @abstractmethod def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to arrive - :returns: the received message""" + :param timeout: Maximum time to wait (in seconds) for messages to arrive + :returns: The received message + """ @property def descriptor(self) -> bytes: - """Return the channel descriptor for the underlying dragon channel""" + """Return the channel descriptor for the underlying dragon channel. + + :returns: Byte encoded channel descriptor + """ if isinstance(self._descriptor, str): return base64.b64decode(self._descriptor.encode("utf-8")) return self._descriptor diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index e902ddadde..710134357c 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -48,11 +48,12 @@ def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool + """Creates a Channel attached to the local memory pool. 
- :param capacity: the number of events the channel can buffer; uses the default + :param capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied - :returns: the instantiated channel""" + :returns: The instantiated channel + """ pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) channel: t.Optional[dch.Channel] = None offset = 0 @@ -83,13 +84,13 @@ def create_local(capacity: int = 0) -> dch.Channel: class DragonCommChannel(cch.CommChannelBase): - """Passes messages by writing to a Dragon channel""" + """Passes messages by writing to a Dragon channel.""" def __init__(self, channel: "dch.Channel") -> None: - """Initialize the DragonCommChannel instance + """Initialize the DragonCommChannel instance. - :param channel: a channel to use for communications - :param recv_timeout: a default timeout to apply to receive calls""" + :param channel: A channel to use for communications + """ serialized_ch = channel.serialize() descriptor = base64.b64encode(serialized_ch).decode("utf-8") super().__init__(descriptor) @@ -97,23 +98,28 @@ def __init__(self, channel: "dch.Channel") -> None: @property def channel(self) -> "dch.Channel": - """The underlying communication channel""" + """The underlying communication channel. + + :returns: The channel + """ return self._channel def send(self, value: bytes, timeout: float = 0.001) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel. :param value: The value to send - :param timeout: maximum time to wait (in seconds) for messages to send""" + :param timeout: Maximum time to wait (in seconds) for messages to send + """ with self._channel.sendh(timeout=timeout) as sendh: sendh.send_bytes(value) logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") def recv(self, timeout: float = 0.001) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to arrive - :returns: the received message""" + :param timeout: Maximum time to wait (in seconds) for messages to arrive + :returns: The received message(s) + """ with self._channel.recvh(timeout=timeout) as recvh: messages: t.List[bytes] = [] @@ -133,7 +139,11 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: def descriptor_string(self) -> str: """Return the channel descriptor for the underlying dragon channel as a string. Automatically performs base64 encoding to ensure the - string can be used in a call to `from_descriptor`""" + string can be used in a call to `from_descriptor`. + + :returns: String representation of channel descriptor + :raises ValueError: If unable to convert descriptor to a string + """ if isinstance(self._descriptor, str): return self._descriptor @@ -147,11 +157,13 @@ def from_descriptor( cls, descriptor: t.Union[bytes, str], ) -> "DragonCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource. Output from `descriptor_string` is correctly encoded. 
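The descriptor handling described above reduces to a base64 round-trip between the raw serialized channel bytes and a string that can safely travel through environment variables. A stdlib-only sketch of that convention, where the `serialized` bytes stand in for the output of `channel.serialize()`:

    import base64

    serialized = b"\x00\x01raw-channel-bytes"  # stand-in for channel.serialize()
    descriptor_string = base64.b64encode(serialized).decode("utf-8")  # safe to store as a string

    # consumers decode the string form back to the raw bytes before attaching
    assert base64.b64decode(descriptor_string.encode("utf-8")) == serialized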
- :returns: An attached DragonCommChannel""" + :returns: An attached DragonCommChannel + :raises SmartSimError: If creation of comm channel fails + """ try: utf8_descriptor: t.Union[str, bytes] = descriptor if isinstance(descriptor, str): diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index a5e5f9f350..12ae727af7 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -45,7 +45,7 @@ class DragonFLIChannel(cch.CommChannelBase): - """Passes messages by writing to a Dragon FLI Channel""" + """Passes messages by writing to a Dragon FLI Channel.""" def __init__( self, @@ -53,11 +53,11 @@ def __init__( sender_supplied: bool = True, buffer_size: int = 0, ) -> None: - """Initialize the DragonFLIChannel instance + """Initialize the DragonFLIChannel instance. - :param fli_desc: the descriptor of the FLI channel to attach - :param sender_supplied: flag indicating if the FLI uses sender-supplied streams - :param buffer_size: maximum number of sent messages that can be buffered + :param fli_desc: The descriptor of the FLI channel to attach + :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams + :param buffer_size: Maximum number of sent messages that can be buffered """ super().__init__(fli_desc) self._fli: "fli" = fli.FLInterface.attach(fli_desc) @@ -66,19 +66,21 @@ def __init__( ) def send(self, value: bytes, timeout: float = 0.001) -> None: - """Send a message through the underlying communication channel + """Send a message through the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param timeout: Maximum time to wait (in seconds) for messages to send + :param value: The value to send + """ with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") def recv(self, timeout: float = 0.001) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to arrive - :returns: the received message""" + :param timeout: Maximum time to wait (in seconds) for messages to arrive + :returns: The received message(s) + """ messages = [] eot = False with self._fli.recvh(timeout=timeout) as recvh: @@ -98,10 +100,12 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFLIChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. 
:param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached DragonFLIChannel""" + :returns: An attached DragonFLIChannel + :raises Exception: If creation of DragonFLIChanenel fails + """ try: return DragonFLIChannel( fli_desc=base64.b64decode(descriptor), diff --git a/smartsim/_core/mli/infrastructure/control/device_manager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py index be0a05d064..10531e701c 100644 --- a/smartsim/_core/mli/infrastructure/control/device_manager.py +++ b/smartsim/_core/mli/infrastructure/control/device_manager.py @@ -49,7 +49,7 @@ def __init__(self, name: str) -> None: def name(self) -> str: """The identifier of the device represented by this object - :return: Name used by the toolkit to identify this device + :returns: Name used by the toolkit to identify this device """ return self._name @@ -143,7 +143,7 @@ def get_device( :param batch: The batch of requests :param feature_store: The feature store on which part of the data needed by the request may be stored - :return: A generator yielding the device + :returns: A generator yielding the device """ model_in_request = batch.has_raw_model diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py index 30cffb8c6b..9de97b9b7e 100644 --- a/smartsim/_core/mli/infrastructure/control/error_handling.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -43,7 +43,7 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: :param status: Status enum :param message: Status message - :return: Failure response + :returns: Failure response """ return MessageHandler.build_response( status=status, diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 2b1bf58952..07574b64ab 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -81,7 +81,7 @@ def __init__( """Time in seconds that has to be waited before flushing a non-full queue. The time of the first item put is 0 seconds.""" self._batch_size = batch_size - """Total capacity of the queue.""" + """Total capacity of the queue""" self._first_put: t.Optional[float] = None """Time at which the first item was put on the queue""" self._disposable = False @@ -96,7 +96,7 @@ def __init__( def uid(self) -> str: """ID of this queue. - :return: Queue ID + :returns: Queue ID """ return self._uid @@ -104,7 +104,7 @@ def uid(self) -> str: def model_id(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests. - :return: Model key + :returns: Model key """ return self._model_id @@ -129,7 +129,7 @@ def put( def _elapsed_time(self) -> float: """Time elapsed since the first item was put on this queue. - :return: Time elapsed + :returns: Time elapsed """ if self.empty() or self._first_put is None: return 0 @@ -139,7 +139,7 @@ def _elapsed_time(self) -> float: def ready(self) -> bool: """Check if the queue can be flushed. 
- :return: True if the queue can be flushed, False otherwise + :returns: True if the queue can be flushed, False otherwise """ if self.empty(): return False @@ -151,21 +151,22 @@ def ready(self) -> bool: return self.full() or timed_out def make_disposable(self) -> None: - """Set this queue as disposable, and never use it again after it gets flushed""" + """Set this queue as disposable, and never use it again after it gets + flushed.""" self._disposable = True @property def can_be_removed(self) -> bool: """Determine whether this queue can be deleted and garbage collected. - :return: True if queue can be removed, False otherwise + :returns: True if queue can be removed, False otherwise """ return self.empty() and self._disposable def flush(self) -> list[t.Any]: """Get all requests from queue. - :return: Requests waiting to be executed + :returns: Requests waiting to be executed """ num_items = self.qsize() self._first_put = None @@ -181,7 +182,7 @@ def flush(self) -> list[t.Any]: def full(self) -> bool: """Check if the queue has reached its maximum capacity. - :return: True if the queue has reached its maximum capacity, + :returns: True if the queue has reached its maximum capacity, False otherwise """ if self._disposable: @@ -191,7 +192,7 @@ def full(self) -> bool: def empty(self) -> bool: """Check if the queue is empty. - :return: True if the queue has 0 elements, False otherwise + :returns: True if the queue has 0 elements, False otherwise """ return self.qsize() == 0 @@ -228,7 +229,7 @@ def __init__( self._batch_timeout = batch_timeout """Time in seconds that has to be waited before flushing a non-full queue""" self._batch_size = batch_size - """Total capacity of each batch queue.""" + """Total capacity of each batch queue""" incoming_channel = config_loader.get_queue() if incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") @@ -327,7 +328,8 @@ def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. :param request: The request to validate - :return: False if the request fails any validation checks, True otherwise""" + :returns: False if the request fails any validation checks, True otherwise + """ checks = [ self._check_feature_stores(request), self._check_model(request), @@ -339,8 +341,7 @@ def _validate_request(self, request: InferenceRequest) -> bool: def _on_iteration(self) -> None: """This method is executed repeatedly until ``Service`` shutdown - conditions are satisfied and cooldown is elapsed. - """ + conditions are satisfied and cooldown is elapsed.""" try: self._perf_timer.is_active = True bytes_list: t.List[bytes] = self._incoming_channel.recv() @@ -414,7 +415,7 @@ def remove_queues(self) -> None: def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed. - :return: The queue + :returns: The queue """ return self._outgoing_queue @@ -469,8 +470,7 @@ def dispatch(self, request: InferenceRequest) -> None: def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all - available request batches in the outgoing queue. - """ + available request batches in the outgoing queue.""" for queue_list in self._queues.values(): for queue in queue_list: if queue.ready: @@ -529,7 +529,7 @@ def flush_requests(self) -> None: def _can_shutdown(self) -> bool: """Determine whether the Service can be shut down. 
- :return: False + :returns: False """ return False diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index 8136be5974..0dcfc89d59 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -149,8 +149,8 @@ def _validate_batch(self, batch: RequestBatch) -> bool: """Ensure the request can be processed. :param batch: The batch of requests to validate - :return: False if the request fails any validation checks, True otherwise""" - + :returns: False if the request fails any validation checks, True otherwise + """ if batch is None or len(batch.requests) == 0: return False @@ -161,7 +161,6 @@ def _validate_batch(self, batch: RequestBatch) -> bool: def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline.""" - pre_batch_time = time.perf_counter() try: batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) @@ -311,7 +310,7 @@ def _on_iteration(self) -> None: def _can_shutdown(self) -> bool: """Determine if the service can be shutdown. - :return: True when criteria to shutdown the service are met, False otherwise + :returns: True when criteria to shutdown the service are met, False otherwise """ # todo: determine shutdown criteria # will we receive a completion message? diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index 364a3ebc9d..02043fbd80 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -53,7 +53,8 @@ def __init__( :param callback_factory: A factory method that produces a callback channel given a descriptor :param queue_factory: A factory method that produces a queue - channel given a descriptor""" + channel given a descriptor + """ self.queue: t.Optional[CommChannelBase] = None """The attached incoming event queue channel""" self.backbone: t.Optional[FeatureStore] = None @@ -69,10 +70,12 @@ def __init__( def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in - an environment variable. The backbone is a standalone, system-created - feature store used to share internal information among MLI components. + the environment variable `_SMARTSIM_INFRA_BACKBONE`. The backbone is + a standalone, system-created feature store used to share internal + information among MLI components. - :returns: The attached feature store via _SMARTSIM_INFRA_BACKBONE""" + :returns: The attached feature store via `_SMARTSIM_INFRA_BACKBONE` + """ descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") if not descriptor: @@ -88,9 +91,10 @@ def get_backbone(self) -> t.Optional[FeatureStore]: def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor - found in an environment variable. + found in the environment variable `_SMARTSIM_REQUEST_QUEUE`. 
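Both environment-loader accessors documented above follow the same pattern: read a descriptor from a well-known environment variable and hand it to a factory, returning None when the variable is unset. A stdlib-only sketch of that pattern, with a placeholder `attach` factory:

    import os
    import typing as t


    def _attach_from_env(var_name: str, attach: t.Callable[[str], t.Any]) -> t.Optional[t.Any]:
        # e.g. var_name == "_SMARTSIM_INFRA_BACKBONE" or "_SMARTSIM_REQUEST_QUEUE"
        descriptor = os.getenv(var_name, "")
        if not descriptor:
            return None  # nothing configured in this environment
        return attach(descriptor)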
- :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE`""" + :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE` + """ descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") if not descriptor: diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index e3ea9f918b..cda31dde67 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -52,25 +52,28 @@ # and passes it wherever they need a FeatureStore? class BackboneFeatureStore(DragonFeatureStore): """A DragonFeatureStore wrapper with utility methods for accessing shared - information stored in the MLI backbone feature store""" + information stored in the MLI backbone feature store.""" MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" def __init__( self, storage: "dragon_ddict.DDict", allow_reserved_writes: bool = False ) -> None: - """Initialize the DragonFeatureStore instance + """Initialize the DragonFeatureStore instance. :param storage: A distributed dictionary to be used as the underlying - storage mechanism of the feature store""" + storage mechanism of the feature store + :param allow_reserved_writes: Whether reserved writes are allowed + """ super().__init__(storage) self._enable_reserved_writes = allow_reserved_writes @property def notification_channels(self) -> t.Sequence[str]: - """Retrieve descriptors for all registered MLI notification channels + """Retrieve descriptors for all registered MLI notification channels. - :returns: the list of descriptors""" + :returns: The list of descriptors + """ if "_SMARTSIM_MLI_NOTIFY_CONSUMERS" in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] return str(stored_consumers).split(",") @@ -78,14 +81,15 @@ def notification_channels(self) -> t.Sequence[str]: @notification_channels.setter def notification_channels(self, values: t.Sequence[str]) -> None: - """Set the notification channels to be sent events + """Set the notification channels to be sent events. - :param values: the list of channel descriptors to save""" + :param values: The list of channel descriptors to save + """ self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values]) class EventCategory(str, enum.Enum): - """Predefined event types raised by SmartSim backend""" + """Predefined event types raised by SmartSim backend.""" CONSUMER_CREATED: str = "consumer-created" FEATURE_STORE_WRITTEN: str = "feature-store-written" @@ -93,7 +97,7 @@ class EventCategory(str, enum.Enum): @dataclass class EventBase: - """Core API for an event""" + """Core API for an event.""" # todo: shift eventing code to: infrastructure / event / event.py category: EventCategory @@ -105,41 +109,42 @@ class EventBase: def __bytes__(self) -> bytes: """Default conversion to bytes for an event required to publish - messages using byte-oriented communication channels + messages using byte-oriented communication channels. - :returns: this entity encoded as bytes""" + :returns: This entity encoded as bytes""" return pickle.dumps(self) def __str__(self) -> str: - """Convert the event to a string + """Convert the event to a string. 
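The `notification_channels` property documented above stores all consumer descriptors as a single comma-joined string under the reserved key. A minimal sketch of that convention over a plain dict, which stands in for the backbone's key/value storage:

    import typing as t

    MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
    store: t.Dict[str, str] = {}  # stand-in for the backbone's key/value storage

    # setter: persist the list as one comma-joined string
    descriptors = ["channel-a", "channel-b"]
    store[MLI_NOTIFY_CONSUMERS] = ",".join(str(d) for d in descriptors)

    # getter: split the stored value back into individual descriptors
    stored = store.get(MLI_NOTIFY_CONSUMERS, "")
    channels = stored.split(",") if stored else []
    assert channels == descriptors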
- :returns: a string representation of this instance""" + :returns: A string representation of this instance""" return f"{self.uid}|{self.category}" class OnCreateConsumer(EventBase): - """Publish this event when a new event consumer registration is required""" + """Publish this event when a new event consumer registration is required.""" descriptor: str """Descriptor of the comm channel exposed by the consumer""" def __init__(self, descriptor: str) -> None: - """Initialize the event + """Initialize the OnCreateConsumer event. - :param descriptor: descriptor of the comm channel exposed by the consumer + :param descriptor: Descriptor of the comm channel exposed by the consumer """ super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4())) self.descriptor = descriptor def __str__(self) -> str: - """Convert the event to a string + """Convert the event to a string. - :returns: a string representation of this instance""" + :returns: A string representation of this instance + """ return f"{str(super())}|{self.descriptor}" class OnWriteFeatureStore(EventBase): - """Publish this event when a feature store key is written""" + """Publish this event when a feature store key is written.""" descriptor: str """The descriptor of the feature store where the write occurred""" @@ -148,7 +153,7 @@ class OnWriteFeatureStore(EventBase): """The key identifying where the write occurred""" def __init__(self, descriptor: str, key: str) -> None: - """Initialize the event + """Initialize the OnWriteFeatureStore event. :param descriptor: The descriptor of the feature store where the write occurred :param key: The key identifying where the write occurred @@ -158,34 +163,36 @@ def __init__(self, descriptor: str, key: str) -> None: self.key = key def __str__(self) -> str: - """Convert the event to a string + """Convert the event to a string. - :returns: a string representation of this instance""" + :returns: A string representation of this instance + """ return f"{str(super())}|{self.descriptor}|{self.key}" class EventProducer(t.Protocol): - """Core API of a class that publishes events""" + """Core API of a class that publishes events.""" def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation + """The send operation. - :param event: the event to send - :param timeout: maximum time to wait (in seconds) for messages to send""" + :param event: The event to send + :param timeout: Maximum time to wait (in seconds) for messages to send + """ class EventBroadcaster: - """Performs fan-out publishing of system events""" + """Performs fan-out publishing of system events.""" def __init__( self, backbone: BackboneFeatureStore, channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, ) -> None: - """Initialize the EventPublisher instance + """Initialize the EventPublisher instance. - :param backbone: the MLI backbone feature store - :param channel_factory: factory method to construct new channel instances + :param backbone: The MLI backbone feature store + :param channel_factory: Factory method to construct new channel instances """ self._backbone = backbone """The backbone feature store used to retrieve consumer descriptors""" @@ -197,7 +204,7 @@ def __init__( """A mapping of instantiated channels that can be re-used. 
Automatically calls the channel factory if a descriptor is not already in the collection""" self._event_buffer: t.Deque[bytes] = deque() - """A buffer for storing events when a consumer list is not found.""" + """A buffer for storing events when a consumer list is not found""" self._descriptors: t.Set[str] """Stores the most recent list of broadcast consumers. Updated automatically on each broadcast""" @@ -206,15 +213,19 @@ def __init__( @property def num_buffered(self) -> int: - """Return the number of events currently buffered to send""" + """Return the number of events currently buffered to send. + + :returns: Number of buffered events + """ return len(self._event_buffer) def _save_to_buffer(self, event: EventBase) -> None: """Places a serialized event in the buffer to be sent once a consumer list is available. - :param event: The event to serialize and buffer""" - + :param event: The event to serialize and buffer + :raises ValueError: If the event cannot be serialized + """ try: event_bytes = bytes(event) self._event_buffer.append(event_bytes) @@ -222,7 +233,7 @@ def _save_to_buffer(self, event: EventBase) -> None: raise ValueError(f"Unable to serialize event from {self._uid}") from ex def _log_broadcast_start(self) -> None: - """Logs broadcast statistics""" + """Logs broadcast statistics.""" num_events = len(self._event_buffer) num_copies = len(self._descriptors) logger.debug( @@ -231,7 +242,7 @@ def _log_broadcast_start(self) -> None: def _prune_unused_consumers(self) -> None: """Performs maintenance on the channel cache by pruning any channel - that has been removed from the consumers list""" + that has been removed from the consumers list.""" active_consumers = set(self._descriptors) current_channels = set(self._channel_cache.keys()) @@ -248,11 +259,12 @@ def _prune_unused_consumers(self) -> None: ) def _get_comm_channel(self, descriptor: str) -> CommChannelBase: - """Helper method to build and cache a comm channel + """Helper method to build and cache a comm channel. - :param descriptor: the descriptor to pass to the channel factory - :returns: the instantiated channel - :raises SmartSimError: if the channel fails to build""" + :param descriptor: The descriptor to pass to the channel factory + :returns: The instantiated channel + :raises SmartSimError: If the channel fails to build + """ comm_channel = self._channel_cache[descriptor] if comm_channel is not None: return comm_channel @@ -272,12 +284,10 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase: def _broadcast(self, timeout: float = 0.001) -> int: """Broadcasts all buffered events to registered event consumers. - :param timeout: maximum time to wait (in seconds) for messages to send - :return: the number of events broadcasted to consumers - :raises ValueError: if event serialization fails - :raises KeyError: if channel fails to attach using registered descriptors - :raises SmartSimError: if broadcasting fails""" - + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of events broadcasted to consumers + :raises SmartSimError: If broadcasting fails + """ # allow descriptors to be empty since events are buffered self._descriptors = set(x for x in self._backbone.notification_channels if x) if not self._descriptors: @@ -316,14 +326,15 @@ def _broadcast(self, timeout: float = 0.001) -> int: def send(self, event: EventBase, timeout: float = 0.001) -> int: """Implementation of `send` method of the `EventPublisher` protocol. 
Publishes - the supplied event to all registered broadcast consumers - - :param event: an event to publish - :param timeout: maximum time to wait (in seconds) for messages to send - :returns: the number of events successfully published - :raises ValueError: if event serialization fails - :raises KeyError: if channel fails to attach using registered descriptors - :raises SmartSimError: if any unexpected error occurs during send""" + the supplied event to all registered broadcast consumers. + + :param event: An event to publish + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of events successfully published + :raises ValueError: If event serialization fails + :raises KeyError: If channel fails to attach using registered descriptors + :raises SmartSimError: If any unexpected error occurs during send + """ try: self._save_to_buffer(event) return self._broadcast(timeout) @@ -334,7 +345,7 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: class EventConsumer: - """Reads system events published to a communications channel""" + """Reads system events published to a communications channel.""" def __init__( self, @@ -343,14 +354,16 @@ def __init__( filters: t.Optional[t.List[EventCategory]] = None, batch_timeout: t.Optional[float] = None, ) -> None: - """Initialize the EventConsumer instance + """Initialize the EventConsumer instance. - :param comm_channel: communications channel to listen to for events - :param backbone: the MLI backbone feature store - :param filters: a list of event types to deliver. when empty, all + :param comm_channel: Communications channel to listen to for events + :param backbone: The MLI backbone feature store + :param filters: A list of event types to deliver. when empty, all events will be delivered - :param timeout: maximum time to wait for messages to arrive; may be overridden - on individual calls to `receive`""" + :param timeout: Maximum time to wait for messages to arrive; may be overridden + on individual calls to `receive` + :raises ValueError: If batch_timeout <= 0 + """ if batch_timeout is not None and batch_timeout <= 0: raise ValueError("batch_timeout must be a non-zero, positive value") @@ -362,12 +375,13 @@ def __init__( def receive( self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0 ) -> t.List[EventBase]: - """Receives available published event(s) + """Receives available published event(s). 
- :param filters: additional filters to add to the global filters configured + :param filters: Additional filters to add to the global filters configured on the EventConsumer instance - :param timeout: maximum time to wait for messages to arrive - :returns: a list of events that pass any configured filters""" + :param timeout: Maximum time to wait for messages to arrive + :returns: A list of events that pass any configured filters + """ if filters is None: filters = [] diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index c322c34e2c..f1e22e2449 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -40,13 +40,14 @@ class DragonFeatureStore(FeatureStore): - """A feature store backed by a dragon distributed dictionary""" + """A feature store backed by a dragon distributed dictionary.""" def __init__(self, storage: "dragon_ddict.DDict") -> None: - """Initialize the DragonFeatureStore instance + """Initialize the DragonFeatureStore instance. :param storage: A distributed dictionary to be used as the underlying - storage mechanism of the feature store""" + storage mechanism of the feature store + """ if isinstance(storage, dragon_ddict.DDict): descriptor = str(storage.serialize()) else: @@ -56,27 +57,30 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: self._storage: t.Dict[str, t.Union[str, bytes]] = storage def _get(self, key: str) -> t.Union[str, bytes]: - """Retrieve a value from the underlying storage mechanism + """Retrieve a value from the underlying storage mechanism. :param key: The unique key that identifies the resource - :returns: the value identified by the key - :raises KeyError: if the key has not been used to store a value""" + :returns: The value identified by the key + :raises KeyError: If the key has not been used to store a value + """ return self._storage[key] def _set(self, key: str, value: t.Union[str, bytes]) -> None: - """Store a value into the underlying storage mechanism + """Store a value into the underlying storage mechanism. :param key: The unique key that identifies the resource :param value: The value to store - :returns: the value identified by the key - :raises KeyError: if the key has not been used to store a value""" + :returns: The value identified by the key + :raises KeyError: If the key has not been used to store a value + """ self._storage[key] = value def _contains(self, key: str) -> bool: - """Determine if the storage mechanism contains a given key + """Determine if the storage mechanism contains a given key. :param key: The unique key that identifies the resource - :returns: True if the key is defined, False otherwise""" + :returns: True if the key is defined, False otherwise + """ return key in self._storage @classmethod @@ -84,11 +88,12 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFeatureStore": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. 
:param descriptor: The descriptor that uniquely identifies the resource :returns: An attached DragonFeatureStore - :raises SmartSimError: if attachment to DragonFeatureStore fails""" + :raises SmartSimError: If attachment to DragonFeatureStore fails + """ try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) except Exception as ex: diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index 04e7134427..ba866d93d1 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -37,7 +37,7 @@ class ReservedKeys(str, enum.Enum): """Contains constants used to identify all featurestore keys that - may not be to used by users. Avoids overwriting system data""" + may not be to used by users. Avoids overwriting system data.""" MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" """Storage location for the list of registered consumers that will receive @@ -45,10 +45,11 @@ class ReservedKeys(str, enum.Enum): @classmethod def contains(cls, value: str) -> bool: - """Convert a string representation into an enumeration member + """Convert a string representation into an enumeration member. - :param value: the string to convert - :returns: the enumeration member if the conversion succeeded, otherwise None""" + :param value: The string to convert + :returns: The enumeration member if the conversion succeeded, otherwise None + """ try: cls(value) except ValueError: @@ -59,7 +60,7 @@ def contains(cls, value: str) -> bool: @dataclass(frozen=True) class FeatureStoreKey: - """A key,descriptor pair enabling retrieval of an item from a feature store""" + """A key,descriptor pair enabling retrieval of an item from a feature store.""" key: str """The unique key of an item in a feature store""" @@ -67,9 +68,9 @@ class FeatureStoreKey: """The unique identifier of the feature store containing the key""" def __post_init__(self) -> None: - """Ensure the key and descriptor have at least one character + """Ensure the key and descriptor have at least one character. - :raises ValueError: if key or descriptor are empty strings + :raises ValueError: If key or descriptor are empty strings """ if len(self.key) < 1: raise ValueError("Key must have at least one character.") @@ -79,14 +80,15 @@ def __post_init__(self) -> None: class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving - values from a feature store implementation""" + values from a feature store implementation.""" def __init__(self, descriptor: str, allow_reserved_writes: bool = False) -> None: - """Initialize the feature store + """Initialize the feature store. - :param descriptor: the stringified version of a storage descriptor - :param allow_reserved_writes: override the default behavior of blocking - writes to reserved keys""" + :param descriptor: The stringified version of a storage descriptor + :param allow_reserved_writes: Override the default behavior of blocking + writes to reserved keys + """ self._enable_reserved_writes = allow_reserved_writes """Flag used to ensure that any keys written by the system to a feature store are not overwritten by user code. Disabled by default. Subclasses must set the @@ -97,10 +99,11 @@ def __init__(self, descriptor: str, allow_reserved_writes: bool = False) -> None def _check_reserved(self, key: str) -> None: """A utility method used to verify access to write to a reserved key - in the FeatureStore. 
Used by subclasses in __setitem___ implementations + in the FeatureStore. Used by subclasses in __setitem___ implementations. - :param key: a key to compare to the reserved keys - :raises SmartSimError: if the key is reserved""" + :param key: A key to compare to the reserved keys + :raises SmartSimError: If the key is reserved + """ if not self._enable_reserved_writes and ReservedKeys.contains(key): raise SmartSimError( "Use of reserved key denied. " @@ -108,9 +111,12 @@ def _check_reserved(self, key: str) -> None: ) def __getitem__(self, key: str) -> t.Union[str, bytes]: - """Retrieve an item using key + """Retrieve an item using key. - :param key: Unique key of an item to retrieve from the feature store""" + :param key: Unique key of an item to retrieve from the feature store + :returns: An item in the FeatureStore + :raises SmartSimError: If retrieving fails + """ try: return self._get(key) except KeyError as ex: @@ -122,10 +128,11 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: ) from ex def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: - """Assign a value using key + """Assign a value using key. :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" + :param value: Value to persist in the feature store + """ self._check_reserved(key) self._set(key, value) @@ -133,52 +140,58 @@ def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. :param key: Unique key of an item to retrieve from the feature store - :returns: `True` if the key is found, `False` otherwise""" + :returns: `True` if the key is found, `False` otherwise + """ return self._contains(key) @abstractmethod def _get(self, key: str) -> t.Union[str, bytes]: - """Retrieve a value from the underlying storage mechanism + """Retrieve a value from the underlying storage mechanism. :param key: The unique key that identifies the resource - :returns: the value identified by the key - :raises KeyError: if the key has not been used to store a value""" + :returns: The value identified by the key + :raises KeyError: If the key has not been used to store a value + """ @abstractmethod def _set(self, key: str, value: t.Union[str, bytes]) -> None: - """Store a value into the underlying storage mechanism + """Store a value into the underlying storage mechanism. :param key: The unique key that identifies the resource :param value: The value to store - :returns: the value identified by the key - :raises KeyError: if the key has not been used to store a value""" + :raises KeyError: If the key has not been used to store a value + """ @abstractmethod def _contains(self, key: str) -> bool: - """Determine if the storage mechanism contains a given key + """Determine if the storage mechanism contains a given key. :param key: The unique key that identifies the resource - :returns: `True` if the key is defined, `False` otherwise""" + :returns: `True` if the key is defined, `False` otherwise + """ @property def _allow_reserved_writes(self) -> bool: """Return the boolean flag indicating if writing to reserved keys is - enabled for this feature store + enabled for this feature store. 
- :returns: `True` if enabled, `False` otherwise""" + :returns: `True` if enabled, `False` otherwise + """ return self._enable_reserved_writes @_allow_reserved_writes.setter def _allow_reserved_writes(self, value: bool) -> None: """Modify the boolean flag indicating if writing to reserved keys is - enabled for this feature store + enabled for this feature store. - :param value: the new value to set for the flag""" + :param value: The new value to set for the flag + """ self._enable_reserved_writes = value @property def descriptor(self) -> str: - """Unique identifier enabling a client to connect to the feature store + """Unique identifier enabling a client to connect to the feature store. - :returns: A descriptor encoded as a string""" + :returns: A descriptor encoded as a string + """ return self._descriptor diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 25a5ed0177..ad152e5d7e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -257,7 +257,7 @@ class RequestBatch: def has_valid_requests(self) -> bool: """Returns whether the batch contains at least one request. - :return: True if at least one request is available + :returns: True if at least one request is available """ return len(self.requests) > 0 @@ -265,7 +265,7 @@ def has_valid_requests(self) -> bool: def has_raw_model(self) -> bool: """Returns whether the batch has a raw model. - :return: True if the batch has a raw model + :returns: True if the batch has a raw model """ return self.raw_model is not None @@ -274,7 +274,7 @@ def raw_model(self) -> t.Optional[t.Any]: """Returns the raw model to use to execute for this batch if it is available. - :return: A model if available, otherwise None""" + :returns: A model if available, otherwise None""" if self.has_valid_requests: return self.requests[0].raw_model return None @@ -283,7 +283,7 @@ def raw_model(self) -> t.Optional[t.Any]: def input_keys(self) -> t.List[FeatureStoreKey]: """All input keys available in this batch's requests. - :return: All input keys belonging to requests in this batch""" + :returns: All input keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.input_keys) @@ -294,7 +294,7 @@ def input_keys(self) -> t.List[FeatureStoreKey]: def output_keys(self) -> t.List[FeatureStoreKey]: """All output keys available in this batch's requests. - :return: All output keys belonging to requests in this batch""" + :returns: All output keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.output_keys) @@ -368,7 +368,7 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: information will be in the form of TensorKeys or TensorDescriptors. 
:param reply: The reply that the output belongs to - :return: The list of prepared outputs, depending on the output + :returns: The list of prepared outputs, depending on the output information needed in the reply """ prepared_outputs: t.List[t.Any] = [] @@ -396,7 +396,7 @@ def fetch_model( :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: Raw bytes of the model + :returns: Raw bytes of the model :raises SmartSimError: If neither a key or a model are provided or the model cannot be retrieved from the feature store :raises ValueError: If a feature store is not available and a raw @@ -433,7 +433,7 @@ def fetch_inputs( :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: The fetched input + :returns: The fetched input :raises ValueError: If neither an input key or an input tensor are provided :raises SmartSimError: If a tensor for a given key cannot be retrieved""" fetch_results = [] @@ -481,7 +481,7 @@ def place_output( :param request: The request that triggered the pipeline :param execute_result: Results from inference :param feature_stores: Available feature stores used for persistence - :return: A collection of keys that were placed in the feature store + :returns: A collection of keys that were placed in the feature store :raises ValueError: If a feature store is not provided """ if not feature_stores: @@ -514,7 +514,7 @@ def load_model( :param request: The request that triggered the pipeline :param device: The device on which the model must be placed - :return: LoadModelResult wrapping the model loaded for the request""" + :returns: LoadModelResult wrapping the model loaded for the request""" @staticmethod @abstractmethod @@ -529,7 +529,7 @@ def transform_input( :param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors - :return: The transformed inputs wrapped in a TransformInputResult""" + :returns: The transformed inputs wrapped in a TransformInputResult""" @staticmethod @abstractmethod @@ -545,7 +545,7 @@ def execute( :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption :param device: The device on which the model will be executed - :return: The result of inference wrapped in an ExecuteResult""" + :returns: The result of inference wrapped in an ExecuteResult""" @staticmethod @abstractmethod @@ -557,4 +557,4 @@ def transform_output( :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :return: A list of transformed outputs""" + :returns: A list of transformed outputs""" diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index efc80c5195..5b6f846fc8 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -48,7 +48,7 @@ def build_tensor_descriptor( :param order: Order of the tensor, such as row-major (c) or column-major (f) :param data_type: Data type of the tensor :param dimensions: Dimensions of the tensor - :return: The TensorDescriptor + :returns: The TensorDescriptor :raises ValueError: If building fails """ try: @@ -76,7 +76,7 @@ def build_output_tensor_descriptor( :param keys: List of 
TensorKeys to apply transorm descriptor to :param data_type: Tranform data type of the tensor :param dimensions: Transform dimensions of the tensor - :return: The OutputDescriptor + :returns: The OutputDescriptor :raises ValueError: If building fails """ try: @@ -101,7 +101,7 @@ def build_tensor_key( :param key: String to set the TensorKey :param feature_store_descriptor: A descriptor identifying the feature store containing the key - :return: The TensorKey + :returns: The TensorKey :raises ValueError: If building fails """ try: @@ -120,7 +120,7 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: :param data: Model data :param name: Model name :param version: Model version - :return: The Model + :returns: The Model :raises ValueError: If building fails """ try: @@ -142,7 +142,7 @@ def build_model_key( :param key: String to set the ModelKey :param feature_store_descriptor: A descriptor identifying the feature store containing the key - :return: The ModelKey + :returns: The ModelKey :raises ValueError: If building fails """ try: @@ -161,7 +161,7 @@ def build_torch_request_attributes( Builds a new TorchRequestAttributes message with the provided tensor type. :param tensor_type: Type of the tensor passed in - :return: The TorchRequestAttributes + :returns: The TorchRequestAttributes :raises ValueError: If building fails """ try: @@ -181,7 +181,7 @@ def build_tf_request_attributes( :param name: Name of the tensor :param tensor_type: Type of the tensor passed in - :return: The TensorFlowRequestAttributes + :returns: The TensorFlowRequestAttributes :raises ValueError: If building fails """ try: @@ -201,7 +201,7 @@ def build_torch_response_attributes() -> ( """ Builds a new TorchResponseAttributes message. - :return: The TorchResponseAttributes + :returns: The TorchResponseAttributes """ return response_attributes_capnp.TorchResponseAttributes.new_message() @@ -212,7 +212,7 @@ def build_tf_response_attributes() -> ( """ Builds a new TensorFlowResponseAttributes message. - :return: The TensorFlowResponseAttributes + :returns: The TensorFlowResponseAttributes """ return response_attributes_capnp.TensorFlowResponseAttributes.new_message() @@ -383,7 +383,7 @@ def build_request( :param outputs: Outputs to be assigned to request :param output_descriptors: Output descriptors to be assigned to request :param custom_attributes: Custom attributes to be assigned to request - :return: The Request + :returns: The Request """ request = request_capnp.Request.new_message() MessageHandler._assign_reply_channel(request, reply_channel) @@ -400,7 +400,7 @@ def serialize_request(request: request_capnp.RequestBuilder) -> bytes: Serializes a built request message. :param request: Request to be serialized - :return: Serialized request bytes + :returns: Serialized request bytes """ return request.to_bytes() @@ -410,7 +410,7 @@ def deserialize_request(request_bytes: bytes) -> request_capnp.Request: Deserializes a serialized request message. 
:param request_bytes: Bytes to be deserialized into a request - :return: Deserialized request + :returns: Deserialized request """ bytes_message = request_capnp.Request.from_bytes( request_bytes, traversal_limit_in_words=2**63 @@ -536,7 +536,7 @@ def build_response( :param message: Message to be assigned to response :param result: Result to be assigned to response :param custom_attributes: Custom attributes to be assigned to response - :return: The Response + :returns: The Response """ response = response_capnp.Response.new_message() MessageHandler._assign_status(response, status) @@ -551,7 +551,7 @@ def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: Serializes a built response message. :param response: Response to be serialized - :return: Serialized response bytes + :returns: Serialized response bytes """ return response.to_bytes() @@ -561,7 +561,7 @@ def deserialize_response(response_bytes: bytes) -> response_capnp.Response: Deserializes a serialized response message. :param response_bytes: Bytes to be deserialized into a response - :return: Deserialized response + :returns: Deserialized response """ bytes_message = response_capnp.Response.from_bytes( response_bytes, traversal_limit_in_words=2**63 From 3e9bffae587654ad88bb5257df37089f5cfdc970 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 12 Sep 2024 17:11:23 -0400 Subject: [PATCH 27/60] Parametrize dragon install (#703) Parameterize the `smart build --dragon` command to enable specification of a fork/repository and package version - add parameter `--dragon-repo` - add parameter `--dragon-version` Sample usage: - Get latest version from private fork `GH_TOKEN=xxxxx smart build --dragon-repo ankona/dragonfork` - Get specific version from private fork `GH_TOKEN=xxxxx smart build --dragon-repo ankona/dragonfork --dragon-version 0.10` - Get specific version from public fork - `GH_TOKEN=xxxxx smart build --dragon-repo dragonhpc/dragon --dragon-version 0.10` - `GH_TOKEN=xxxxx smart build --dragon-version 0.10` - `smart build --dragon-repo dragonhpc/dragon --dragon-version 0.10` - `smart build --dragon-version 0.10` ## manual test results 1. OK - `smart build --dragon` ``` [1] % smart build --dragon [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 2. OK_FAIL - `smart build --dragon-repo dragonhpc/dragon-nightly` ``` [1] % smart build --dragon-repo dragonhpc/dragon-nightly [SmartSim] INFO Running SmartSim build process... 
[SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] ERROR An access token must be available to access dragonhpc/dragon-nightly. Set the `GH_TOKEN` env var to pass your access token. [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 3. OK_FAIL - `smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.9` - no gh_token supplied to connect to private repo, no version 0.9 in that repo ``` [130] % smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.9 [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] ERROR An access token must be available to access dragonhpc/dragon-nightly. Set the `GH_TOKEN` env var to pass your access token. [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 3. OK_FAIL - `GH_TOKEN=xxx smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.9` - no version 0.9 in that repo ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] WARNING Please specify a dragon version (e.g. 0.9) of an asset available in the repository: dragon-0.10-py3.10.10-ff4c77a60.tar.gz dragon-0.10-py3.11.5-ff4c77a60.tar.gz dragon-0.10-py3.9.4.1-ff4c77a60.tar.gz dragondocs-0.10-ff4c77a60.tar.gz [SmartSim] WARNING No dragon runtime asset available to install [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 4. OK FAIL - `smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10` - no gh_token supplied to connect to private repo ``` [1] % smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10 [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] ERROR An access token must be available to access dragonhpc/dragon-nightly. Set the `GH_TOKEN` env var to pass your access token. [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 5. OK - `GH_TOKEN=xxx smart build --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10` ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] WARNING Platform-specific package not found. Using dragon-0.10-py3.11.5-ff4c77a60.tar.gz [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon-nightly/releases/assets/190659388") [SmartSim] DEBUG Retrieved asset dragon-0.10-py3.11.5-ff4c77a60.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon-nightly/releases/assets/190659388 [SmartSim] DEBUG Extracted dragon-0.10-py3.11.5-ff4c77a60.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/190659388 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/190659388/dragon-0.10-py3.11.5-ff4c77a60.tar.gz/dragon-0.10/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/190659388/dragon-0.10-py3.11.5-ff4c77a60.tar.gz/dragon-0.10/dragon-0.10-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/190659388 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 5. 
OK_FAIL - `smart build --dragon-version 0.10` - no v0.10 in that repo ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] WARNING Please specify a dragon version (e.g. 0.9) of an asset available in the repository: dragon-0.9-py3.10.10-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.10.10-ec3fc0f8a.tar.gz dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.11.5-ec3fc0f8a.tar.gz dragon-0.9-py3.9.4.1-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.9.4.1-ec3fc0f8a.tar.gz [SmartSim] WARNING No dragon runtime asset available to install [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 6. OK - `smart build --dragon-version 0.9` ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 7. OK - `export GH_TOKEN=xxx smart build --dragon-version 0.9` ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 8. OK - `smart build --dragon-repo dragonhpc/dragon` - handle lower case fine! ``` [1] % smart build --dragon-repo dragonhpc/dragon [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... 
[SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 9. OK - `GH_TOKEN=xxx smart build --dragon-repo dragonhpc/dragon` - token not required for public, but works: ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 8. OK - `smart build --dragon-repo dragonhpc/dragon --dragon-version 0.9` ``` [1] % smart build --dragon-repo dragonhpc/dragon --dragon-version 0.9 [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 9. 
OK - `GH_TOKEN=xxx smart build --dragon-repo dragonhpc/dragon --dragon-version 0.9` ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 10. OK - 1. `smart build --dragon --dragon-version 0.9` ``` [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] DEBUG Retrieved asset metadata: GitReleaseAsset(url="https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358") [SmartSim] DEBUG Retrieved asset dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz metadata from https://api.github.com/repos/DragonHPC/dragon/releases/assets/165524358 [SmartSim] DEBUG Extracted dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz to /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] DEBUG Installing packages: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/dragon-0.9-cp311-cp311-linux_x86_64.whl /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358/dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz/dragon-0.9/pycapnp-2.0.0-cp311-cp311-linux_x86_64.whl [SmartSim] DEBUG Deleted temporary files in: /lus/bnchlu1/mcbridch/code/ss/smartsim/_core/.dragon/165524358 [SmartSim] INFO Dragon installation complete [SmartSim] INFO Redis build complete! ``` 11. OK_FAIL - 1. `smart build --dragon --dragon-version 0.10` ``` [1] % smart build --dragon --dragon-version 0.10 [SmartSim] INFO Running SmartSim build process... [SmartSim] INFO Checking requested versions... [SmartSim] DEBUG Checking for build tools... [SmartSim] WARNING Please specify a dragon version (e.g. 0.9) of an asset available in the repository: dragon-0.9-py3.10.10-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.10.10-ec3fc0f8a.tar.gz dragon-0.9-py3.11.5-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.11.5-ec3fc0f8a.tar.gz dragon-0.9-py3.9.4.1-CRAYEX-ce895d2de.tar.gz dragon-0.9-py3.9.4.1-ec3fc0f8a.tar.gz [SmartSim] WARNING No dragon runtime asset available to install [SmartSim] WARNING Dragon installation failed [SmartSim] INFO Redis build complete! ``` 12. OK - `smart build -h` ``` --dragon-repo DRAGON_REPO Specify a git repo containing dragon release assets (e.g. DragonHPC/dragon) --dragon-version DRAGON_VERSION Specify the dragon version to install (e.g. 
0.9) ``` [ committed by @ankona ] [ reviewed by @AlyssaCote ] --- .github/workflows/run_tests.yml | 4 +- doc/changelog.md | 1 + doc/installation_instructions/basic.rst | 14 + smartsim/_core/_cli/build.py | 42 ++- smartsim/_core/_cli/scripts/dragon_install.py | 337 ++++++++++++++---- smartsim/_core/_install/builder.py | 79 +++- smartsim/_core/config/config.py | 4 - tests/dragon/channel.py | 127 +++++++ tests/dragon/test_featurestore_base.py | 5 +- tests/test_dragon_installer.py | 134 ++++--- tests/test_dragon_launcher.py | 11 +- 11 files changed, 583 insertions(+), 175 deletions(-) create mode 100644 tests/dragon/channel.py diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 089493d3b3..f0b0ba6663 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -117,8 +117,10 @@ jobs: - name: Install ML Runtimes with Smart (with pt, tf, dragon, and onnx support) if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset == 'dragon' ) + env: + SMARTSIM_DRAGON_TOKEN: ${{ secrets.DRAGON_TOKEN }} run: | - smart build --device cpu --onnx --dragon -v + smart build --device cpu --onnx -v --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10 SP=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/config/dragon/.env LLP=$(cat $SP | grep LD_LIBRARY_PATH | awk '{split($0, array, "="); print array[2]}') echo "LD_LIBRARY_PATH=$LLP:$LD_LIBRARY_PATH" >> $GITHUB_ENV diff --git a/doc/changelog.md b/doc/changelog.md index 6e53070a19..79163733b7 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Parameterize installation of dragon package with `smart build` - Update docstrings - Implement asynchronous notifications for shared data - Filenames conform to snake case diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 02c17e1fda..8b6ce83947 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -255,6 +255,20 @@ For example, to install dragon alongside the RedisAI CPU backends, you can run smart build --device cpu --dragon # install Dragon, PT and TF for cpu smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu +``smart build`` supports installing a specific version of dragon. It exposes the +parameters ``--dragon-repo`` and ``--dragon-version``, which can be used alone or +in combination to customize the Dragon installation. For example: + +.. code-block:: bash + + # using the --dragon-repo and --dragon-version flags to customize the Dragon installation + smart build --device cpu --dragon-repo userfork/dragon # install Dragon from a specific repo + smart build --device cpu --dragon-version 0.10 # install a specific Dragon release + + # combining both flags + smart build --device cpu --dragon-repo userfork/dragon --dragon-version 0.91 + + .. note:: Dragon is only supported on Linux systems. For further information, you can read :ref:`the dedicated documentation page `. 
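A minimal sketch of how the documented flags compose, based on the sample usage in the commit message and the `basic.rst` additions above. The repository name `userfork/dragon` is a placeholder; per the validation added later in this patch (`DragonInstallRequest._check`), any repository other than the default `DragonHPC/dragon` requires an access token, which the new install code reads from the environment variable named by `_GH_TOKEN` (`SMARTSIM_DRAGON_TOKEN`); the commit message samples show it as `GH_TOKEN`.

```bash
# default repository (DragonHPC/dragon) and the pinned default version
smart build --device cpu --dragon

# pin a specific dragon release from the default repository
smart build --device cpu --dragon-version 0.9

# install a specific release from a fork; non-default repositories need an access token
SMARTSIM_DRAGON_TOKEN=<token> smart build --device cpu --dragon-repo userfork/dragon --dragon-version 0.10
```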
diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 65a5504c6f..cbcbc71e6c 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -33,7 +33,13 @@ from tabulate import tabulate -from smartsim._core._cli.scripts.dragon_install import install_dragon +from smartsim._core._cli.scripts.dragon_install import ( + DEFAULT_DRAGON_REPO, + DEFAULT_DRAGON_VERSION, + DragonInstallRequest, + display_post_install_logs, + install_dragon, +) from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, color_bool, pip from smartsim._core._install import builder from smartsim._core._install.buildenv import ( @@ -380,6 +386,8 @@ def execute( keydb = args.keydb device = Device(args.device.lower()) is_dragon_requested = args.dragon + dragon_repo = args.dragon_repo + dragon_version = args.dragon_version # torch and tf build by default pt = not args.no_pt # pylint: disable=invalid-name tf = not args.no_tf # pylint: disable=invalid-name @@ -409,12 +417,21 @@ def execute( version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") - if is_dragon_requested: - install_to = CONFIG.core_path / ".dragon" - return_code = install_dragon(install_to) + if is_dragon_requested or dragon_repo or dragon_version: + try: + request = DragonInstallRequest( + CONFIG.core_path / ".dragon", + dragon_repo, + dragon_version, + ) + return_code = install_dragon(request) + except ValueError as ex: + return_code = 2 + logger.error(" ".join(ex.args)) if return_code == 0: - logger.info("Dragon installation complete") + display_post_install_logs() + elif return_code == 1: logger.info("Dragon installation not supported on platform") else: @@ -483,6 +500,21 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: default=False, help="Install the dragon runtime", ) + parser.add_argument( + "--dragon-repo", + default=None, + type=str, + help=( + "Specify a git repo containing dragon release assets " + f"(e.g. {DEFAULT_DRAGON_REPO})" + ), + ) + parser.add_argument( + "--dragon-version", + default=None, + type=str, + help=f"Specify the dragon version to install (e.g. 
{DEFAULT_DRAGON_VERSION})", + ) parser.add_argument( "--only_python_packages", action="store_true", diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index f88af4eb4f..4fd0be3004 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -1,15 +1,19 @@ import os import pathlib +import re import shutil import sys import typing as t -from urllib.request import urlretrieve +from urllib.request import Request, urlopen from github import Github +from github.Auth import Token +from github.GitRelease import GitRelease from github.GitReleaseAsset import GitReleaseAsset +from github.Repository import Repository from smartsim._core._cli.utils import pip -from smartsim._core._install.builder import WebTGZ +from smartsim._core._install.builder import _WebTGZ from smartsim._core.config import CONFIG from smartsim._core.utils.helpers import check_platform, is_crayex_platform from smartsim.error.errors import SmartSimCLIActionCancelled @@ -17,8 +21,78 @@ logger = get_logger(__name__) +DEFAULT_DRAGON_REPO = "DragonHPC/dragon" +DEFAULT_DRAGON_VERSION = "0.9" +DEFAULT_DRAGON_VERSION_TAG = f"v{DEFAULT_DRAGON_VERSION}" +_GH_TOKEN = "SMARTSIM_DRAGON_TOKEN" -def create_dotenv(dragon_root_dir: pathlib.Path) -> None: + +class DragonInstallRequest: + """Encapsulates a request to install the dragon package""" + + def __init__( + self, + working_dir: pathlib.Path, + repo_name: t.Optional[str] = None, + version: t.Optional[str] = None, + ) -> None: + """Initialize an install request. + + :param working_dir: A path to store temporary files used during installation + :param repo_name: The name of a repository to install from, e.g. DragonHPC/dragon + :param version: The version to install, e.g. v0.10 + """ + + self.working_dir = working_dir + """A path to store temporary files used during installation""" + + self.repo_name = repo_name or DEFAULT_DRAGON_REPO + """The name of a repository to install from, e.g. DragonHPC/dragon""" + + self.pkg_version = version or DEFAULT_DRAGON_VERSION + """The version to install, e.g. 0.10""" + + self._check() + + def _check(self) -> None: + """Perform validation of this instance + + :raises: ValueError if any value fails validation""" + if not self.repo_name or len(self.repo_name.split("/")) != 2: + raise ValueError( + f"Invalid dragon repository name. Example: `dragonhpc/dragon`" + ) + + # version must match standard dragon tag & filename format `vX.YZ` + match = re.match(r"^\d\.\d+$", self.pkg_version) + if not self.pkg_version or not match: + raise ValueError("Invalid dragon version. Examples: `0.9, 0.91, 0.10`") + + # attempting to retrieve from a non-default repository requires an auth token + if self.repo_name.lower() != DEFAULT_DRAGON_REPO.lower() and not self.raw_token: + raise ValueError( + f"An access token must be available to access {self.repo_name}. " + f"Set the `{_GH_TOKEN}` env var to pass your access token." 
+ ) + + @property + def raw_token(self) -> t.Optional[str]: + """Returns the raw access token from the environment, if available""" + return os.environ.get(_GH_TOKEN, None) + + +def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: + """Create a Github.Auth.Token if an access token can be found + in the environment + + :param request: details of a request for the installation of the dragon package + :returns: an auth token if one can be built, otherwise `None`""" + if gh_token := request.raw_token: + return Token(gh_token) + return None + + +def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) dragon_inc_dir = str(dragon_root_dir / "include") @@ -30,7 +104,7 @@ def create_dotenv(dragon_root_dir: pathlib.Path) -> None: "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, - "DRAGON_VERSION": dragon_pin(), + "DRAGON_VERSION": dragon_version, "PATH": dragon_bin_dir, "LD_LIBRARY_PATH": dragon_lib_dir, } @@ -50,12 +124,6 @@ def python_version() -> str: return f"py{sys.version_info.major}.{sys.version_info.minor}" -def dragon_pin() -> str: - """Return a string indicating the pinned major/minor version of the dragon - package to install""" - return "0.9" - - def _platform_filter(asset_name: str) -> bool: """Return True if the asset name matches naming standard for current platform (Cray or non-Cray). Otherwise, returns False. @@ -77,67 +145,125 @@ def _version_filter(asset_name: str) -> bool: return python_version() in asset_name -def _pin_filter(asset_name: str) -> bool: +def _pin_filter(asset_name: str, dragon_version: str) -> bool: """Return true if the supplied value contains a dragon version pin match - :param asset_name: A value to inspect for keywords indicating a dragon version + :param asset_name: the asset name to inspect for keywords indicating a dragon version + :param dragon_version: the dragon version to match :returns: True if supplied value is correct for current dragon version""" - return f"dragon-{dragon_pin()}" in asset_name + return f"dragon-{dragon_version}" in asset_name + +def _get_all_releases(dragon_repo: Repository) -> t.Collection[GitRelease]: + """Retrieve all available releases for the configured dragon repository -def _get_release_assets() -> t.Collection[GitReleaseAsset]: + :param dragon_repo: A GitHub repository object for the dragon package + :returns: A list of GitRelease""" + all_releases = [release for release in list(dragon_repo.get_releases())] + return all_releases + + +def _get_release_assets(request: DragonInstallRequest) -> t.Collection[GitReleaseAsset]: """Retrieve a collection of available assets for all releases that satisfy the dragon version pin + :param request: details of a request for the installation of the dragon package :returns: A collection of release assets""" - git = Github() - - dragon_repo = git.get_repo("DragonHPC/dragon") + auth = get_auth_token(request) + git = Github(auth=auth) + dragon_repo = git.get_repo(request.repo_name) if dragon_repo is None: raise SmartSimCLIActionCancelled("Unable to locate dragon repo") - # find any releases matching our pinned version requirement - tags = [tag for tag in dragon_repo.get_tags() if dragon_pin() in tag.name] - # repo.get_latest_release fails if only pre-release results are returned - pin_releases = list(dragon_repo.get_release(tag.name) for tag in tags) - 
releases = sorted(pin_releases, key=lambda r: r.published_at, reverse=True) + all_releases = sorted( + _get_all_releases(dragon_repo), key=lambda r: r.published_at, reverse=True + ) - # take the most recent release for the given pin - assets = releases[0].assets + # filter the list of releases to include only the target version + releases = [ + release + for release in all_releases + if request.pkg_version in release.title or release.tag_name + ] + + releases = sorted(releases, key=lambda r: r.published_at, reverse=True) + + if not releases: + release_titles = ", ".join(release.title for release in all_releases) + raise SmartSimCLIActionCancelled( + f"Unable to find a release for dragon version {request.pkg_version}. " + f"Available releases: {release_titles}" + ) + + assets: t.List[GitReleaseAsset] = [] + + # install the latest release of the target version (including pre-release) + for release in releases: + # delay in attaching release assets may leave us with an empty list, retry + # with the next available release + if assets := list(release.get_assets()): + logger.debug(f"Found assets for dragon release {release.title}") + break + else: + logger.debug(f"No assets for dragon release {release.title}. Retrying.") + + if not assets: + raise SmartSimCLIActionCancelled( + f"Unable to find assets for dragon release {release.title}" + ) return assets -def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleaseAsset]: +def filter_assets( + request: DragonInstallRequest, assets: t.Collection[GitReleaseAsset] +) -> t.Optional[GitReleaseAsset]: """Filter the available release assets so that HSTA agents are used when run on a Cray EX platform + :param request: details of a request for the installation of the dragon package :param assets: The collection of dragon release assets to filter :returns: An asset meeting platform & version filtering requirements""" # Expect cray & non-cray assets that require a filter, e.g. # 'dragon-0.8-py3.9.4.1-bafaa887f.tar.gz', # 'dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz' - asset = next( - ( - asset - for asset in assets - if _version_filter(asset.name) - and _platform_filter(asset.name) - and _pin_filter(asset.name) - ), - None, + all_assets = [asset.name for asset in assets] + + assets = list( + asset + for asset in assets + if _version_filter(asset.name) and _pin_filter(asset.name, request.pkg_version) ) + + if len(assets) == 0: + available = "\n\t".join(all_assets) + logger.warning( + f"Please specify a dragon version (e.g. {DEFAULT_DRAGON_VERSION}) " + f"of an asset available in the repository:\n\t{available}" + ) + return None + + asset: t.Optional[GitReleaseAsset] = None + + # Apply platform filter if we have multiple matches for python/dragon version + if len(assets) > 0: + asset = next((asset for asset in assets if _platform_filter(asset.name)), None) + + if not asset: + asset = assets[0] + logger.warning(f"Platform-specific package not found. Using {asset.name}") + return asset -def retrieve_asset_info() -> GitReleaseAsset: +def retrieve_asset_info(request: DragonInstallRequest) -> GitReleaseAsset: """Find a release asset that meets all necessary filtering criteria - :param dragon_pin: identify the dragon version to install (e.g. 
dragon-0.8) + :param request: details of a request for the installation of the dragon package :returns: A GitHub release asset""" - assets = _get_release_assets() - asset = filter_assets(assets) + assets = _get_release_assets(request) + asset = filter_assets(request, assets) platform_result = check_platform() if not platform_result.is_cray: @@ -152,55 +278,77 @@ def retrieve_asset_info() -> GitReleaseAsset: return asset -def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib.Path: +def retrieve_asset( + request: DragonInstallRequest, asset: GitReleaseAsset +) -> pathlib.Path: """Retrieve the physical file associated to a given GitHub release asset - :param working_dir: location in file system where assets should be written + :param request: details of a request for the installation of the dragon package :param asset: GitHub release asset to retrieve - :returns: path to the directory containing the extracted release asset""" - download_dir = working_dir / str(asset.id) + :returns: path to the directory containing the extracted release asset + :raises: SmartSimCLIActionCancelled if the asset cannot be downloaded or extracted + """ + download_dir = request.working_dir / str(asset.id) # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead - if download_dir.exists() or list(download_dir.rglob("*.whl")): - return download_dir + cleanup(download_dir) download_dir.mkdir(parents=True, exist_ok=True) # grab a copy of the complete asset asset_path = download_dir / str(asset.name) - download_url = asset.browser_download_url + + # use the asset URL instead of the browser_download_url to enable + # using auth for private repositories + headers: t.Dict[str, str] = {"Accept": "application/octet-stream"} + + if request.raw_token: + headers["Authorization"] = f"Bearer {request.raw_token}" try: - urlretrieve(download_url, str(asset_path)) - logger.debug(f"Retrieved asset {asset.name} from {download_url}") + # a github asset endpoint causes a redirect. 
the first request + # receives a pre-signed URL to the asset to pass on to WebTGZ + dl_request = Request(asset.url, headers=headers) + response = urlopen(dl_request) + presigned_url = response.url + + logger.debug(f"Retrieved asset {asset.name} metadata from {asset.url}") except Exception: - logger.exception(f"Unable to download asset from: {download_url}") + logger.exception(f"Unable to download {asset.name} from: {asset.url}") + presigned_url = asset.url # extract the asset - archive = WebTGZ(download_url) - archive.extract(download_dir) + try: + archive = _WebTGZ(presigned_url, headers=headers) + archive.extract(asset_path) + logger.debug(f"Extracted {asset.name} to {download_dir}") + except Exception as ex: + raise SmartSimCLIActionCancelled( + f"Unable to extract {asset.name} from {download_dir}" + ) from ex - logger.debug(f"Extracted {download_url} to {download_dir}") return download_dir -def install_package(asset_dir: pathlib.Path) -> int: +def install_package(request: DragonInstallRequest, asset_dir: pathlib.Path) -> int: """Install the package found in `asset_dir` into the current python environment - :param asset_dir: path to a decompressed archive contents for a release asset""" + :param request: details of a request for the installation of the dragon package + :param asset_dir: path to a decompressed archive contents for a release asset + :returns: Integer return code, 0 for success, non-zero on failures""" found_wheels = list(asset_dir.rglob("*.whl")) if not found_wheels: logger.error(f"No wheel(s) found for package in {asset_dir}") return 1 - create_dotenv(found_wheels[0].parent) + create_dotenv(found_wheels[0].parent, request.pkg_version) try: wheels = list(map(str, found_wheels)) - logger.info("Installing packages:\n%s", "\n".join(wheels)) - - pip("install", *wheels) + for wheel_path in wheels: + logger.info(f"Installing package: {wheel_path}") + pip("install", wheel_path) except Exception: logger.error(f"Unable to install from {asset_dir}") return 1 @@ -214,36 +362,83 @@ def cleanup( """Delete the downloaded asset and any files extracted during installation :param archive_path: path to a downloaded archive for a release asset""" - if archive_path: - archive_path.unlink(missing_ok=True) - logger.debug(f"Deleted archive: {archive_path}") + if not archive_path: + return + + if archive_path.exists() and archive_path.is_file(): + archive_path.unlink() + archive_path = archive_path.parent + + if archive_path.exists() and archive_path.is_dir(): + shutil.rmtree(archive_path, ignore_errors=True) + logger.debug(f"Deleted temporary files in: {archive_path}") -def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int: +def install_dragon(request: DragonInstallRequest) -> int: """Retrieve a dragon runtime appropriate for the current platform and install to the current python environment - :param extraction_dir: path for download and extraction of assets + + :param request: details of a request for the installation of the dragon package :returns: Integer return code, 0 for success, non-zero on failures""" if sys.platform == "darwin": logger.debug(f"Dragon not supported on platform: {sys.platform}") return 1 - extraction_dir = pathlib.Path(extraction_dir) - filename: t.Optional[pathlib.Path] = None asset_dir: t.Optional[pathlib.Path] = None try: - asset_info = retrieve_asset_info() - asset_dir = retrieve_asset(extraction_dir, asset_info) + asset_info = retrieve_asset_info(request) + if asset_info is not None: + asset_dir = retrieve_asset(request, asset_info) + return 
install_package(request, asset_dir) - return install_package(asset_dir) + except SmartSimCLIActionCancelled as ex: + logger.warning(*ex.args) except Exception as ex: - logger.error("Unable to install dragon runtime", exc_info=ex) - finally: - cleanup(filename) + logger.error("Unable to install dragon runtime", exc_info=True) return 2 +def display_post_install_logs() -> None: + """Display post-installation instructions for the user""" + + examples = { + "ofi-include": "/opt/cray/include", + "ofi-build-lib": "/opt/cray/lib64", + "ofi-runtime-lib": "/opt/cray/lib64", + } + + config = ":".join(f"{k}={v}" for k, v in examples.items()) + example_msg1 = f"dragon-config -a \\" + example_msg2 = f' "{config}"' + + logger.info( + "************************** Dragon Package Installed *****************************" + ) + logger.info("To enable Dragon to use HSTA (default: TCP), configure the following:") + + for key in examples: + logger.info(f"\t{key}") + + logger.info("Example:") + logger.info(example_msg1) + logger.info(example_msg2) + logger.info( + "*********************************************************************************" + ) + + if __name__ == "__main__": - sys.exit(install_dragon(CONFIG.core_path / ".dragon")) + # path for download and extraction of assets + extraction_dir = CONFIG.core_path / ".dragon" + dragon_repo = DEFAULT_DRAGON_REPO + dragon_version = DEFAULT_DRAGON_VERSION + + request = DragonInstallRequest( + extraction_dir, + dragon_repo, + dragon_version, + ) + + sys.exit(install_dragon(request)) diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index e41fe2342d..ae1d47c79f 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -40,13 +40,13 @@ import tarfile import tempfile import typing as t -import urllib.request import zipfile from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path from shutil import which from subprocess import SubprocessError +from urllib.request import build_opener, install_opener, urlretrieve # NOTE: This will be imported by setup.py and hence no smartsim related # items should be imported into this file. 
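For reference, the authenticated download flow that these changes implement works as follows: the GitHub release-asset endpoint is requested with an "Accept: application/octet-stream" header (plus a bearer token when the asset lives in a private repository), GitHub answers with a redirect, and the redirect target is the pre-signed URL of the file itself, which is then handed to the archive downloader. A minimal, illustrative sketch of that flow is shown below; the helper name and arguments are placeholders, not code from this patch.

import typing as t
from urllib.request import Request, urlopen


def resolve_asset_url(asset_url: str, token: t.Optional[str] = None) -> str:
    # Ask for the raw asset rather than the JSON metadata describing it
    headers = {"Accept": "application/octet-stream"}
    if token:
        # Only needed when the asset is hosted in a private repository
        headers["Authorization"] = f"Bearer {token}"
    # GitHub responds with a redirect; urlopen follows it, so the URL on the
    # response object is the pre-signed location of the asset itself
    with urlopen(Request(asset_url, headers=headers)) as response:
        return response.url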
@@ -795,50 +795,101 @@ def __place_for_rai__(self, target: _PathLike) -> Path: class _WebArchive(_WebLocation): + """Used to download a remote resource""" + + def __init__(self, headers: t.Optional[t.Dict[str, str]] = None) -> None: + """Initialize the instance""" + self._headers: t.Dict[str, str] = headers or {} + @property def name(self) -> str: - _, name = self.url.rsplit("/", 1) + """Return the resource name identified by the URL.""" + # omit the querystring to find the resource name + addressparts = self.url.split("?", maxsplit=1) + address = addressparts[0] + _, name = address.rsplit("/", 1) return name def download(self, target: _PathLike) -> Path: + """Retrieve the remote file + + :param target: The desired target path for writing the downloaded file + :returns: The path to the downloaded file""" target = Path(target) if target.is_dir(): target = target / self.name - file, _ = urllib.request.urlretrieve(self.url, target) - return Path(file).resolve() + + if hasattr(self, "_headers") and self._headers: + opener = build_opener() + opener.addheaders = list(self._headers.items()) + install_opener(opener) + + try: + file, _ = urlretrieve(self.url, target) + return Path(file).resolve() + finally: + opener = build_opener() + install_opener(opener) class _ExtractableWebArchive(_WebArchive, ABC): + """Abstract base class for implementing download and + extraction of a remote archive file""" + @abstractmethod - def _extract_download(self, download_path: Path, target: _PathLike) -> None: ... + def _extract_download(self, download_path: Path, target: _PathLike) -> None: + """Called during file handling to perform format-specific extraction + operations. Must be overridden in child classes + + :param download_path: Path to the downloaded archive file + :param target: Desired target location for extraction""" def extract(self, target: _PathLike) -> None: + """Extract the downloaded file into the desired target location""" with tempfile.TemporaryDirectory() as tmp_dir: arch_path = self.download(tmp_dir) self._extract_download(arch_path, target) class _WebTGZ(_ExtractableWebArchive): + """Performs download and extraction of a remote archive file + in the `.tar.gz` format.""" + + def __init__(self, url: str, headers: t.Optional[t.Dict[str, str]] = None) -> None: + """Initialize the instance + + :param url: URL pointing to a .tar.gz file + :param headers: Additional headers required to download the file""" + super().__init__(headers) + self._url = url + + @property + def url(self) -> str: + """Returns the url that was downloaded""" + return self._url + def _extract_download(self, download_path: Path, target: _PathLike) -> None: + """Called during file handling to perform extraction of `.tar.gz` files + + :param download_path: Path to the downloaded archive file + :param target: Desired target location for extraction""" with tarfile.open(download_path, "r") as tgz_file: tgz_file.extractall(target) class _WebZip(_ExtractableWebArchive): + """Performs download and extraction of a remote archive file + in the `.zip` format.""" + def _extract_download(self, download_path: Path, target: _PathLike) -> None: + """Called during file handling to perform extraction of `.zip` files + + :param download_path: Path to the downloaded archive file + :param target: Desired target location for extraction""" with zipfile.ZipFile(download_path, "r") as zip_file: zip_file.extractall(target) -class WebTGZ(_WebTGZ): - def __init__(self, url: str) -> None: - self._url = url - - @property - def url(self) -> str: - return 
self._url - - @dataclass(frozen=True) class _PTArchive(_WebZip, _RAIBuildDependency): architecture: Architecture diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 98e895a7d0..2dbda1b2b3 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -296,10 +296,6 @@ def smartsim_key_path(self) -> str: default_path = Path.home() / ".smartsim" / "keys" return os.environ.get("SMARTSIM_KEY_PATH", str(default_path)) - @property - def dragon_pin(self) -> str: - return "0.9" - @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py new file mode 100644 index 0000000000..2348784236 --- /dev/null +++ b/tests/dragon/channel.py @@ -0,0 +1,127 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import base64
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+    """Passes messages by writing to a file"""
+
+    def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
+        """Initialize the FileSystemCommChannel instance
+
+        :param key: path to the file backing the communication channel"""
+        self._lock = threading.RLock()
+        if isinstance(key, pathlib.Path):
+            super().__init__(key.as_posix().encode("utf-8"))
+            self._file_path = key
+        else:
+            super().__init__(key)
+            self._file_path = pathlib.Path(key.decode("utf-8"))
+
+        if not self._file_path.parent.exists():
+            self._file_path.parent.mkdir(parents=True)
+
+        self._file_path.touch()
+
+    def send(self, value: bytes, timeout: float = 0) -> None:
+        """Send a message through the underlying communication channel
+
+        :param timeout: maximum time to wait (in seconds) for messages to send
+        :param value: The value to send"""
+        with self._lock:
+            # write as text so we can add newlines as delimiters
+            with open(self._file_path, "a") as fp:
+                encoded_value = base64.b64encode(value).decode("utf-8")
+                fp.write(f"{encoded_value}\n")
+            logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
+
+    def recv(self, timeout: float = 0) -> t.List[bytes]:
+        """Receives message(s) through the underlying communication channel
+
+        :param timeout: maximum time to wait (in seconds) for messages to arrive
+        :returns: the received message(s)
+        :raises SmartSimError: if the descriptor points to a missing file"""
+        with self._lock:
+            messages: t.List[bytes] = []
+            if not self._file_path.exists():
+                raise SmartSimError("Empty channel")
+
+            # read as text so we can split on newlines
+            with open(self._file_path, "r") as fp:
+                lines = fp.readlines()
+
+            if lines:
+                line = lines.pop(0)
+                event_bytes = base64.b64decode(line.encode("utf-8"))
+                messages.append(event_bytes)
+
+                self.clear()
+
+                # remove the first message only, write remainder back...
+ if len(lines) > 0: + with open(self._file_path, "w") as fp: + fp.writelines(lines) + + logger.debug( + f"FileSystemCommChannel {self._file_path} received message" + ) + + return messages + + def clear(self) -> None: + """Create an empty file for events""" + if self._file_path.exists(): + self._file_path.unlink() + self._file_path.touch() + + @classmethod + def from_descriptor( + cls, + descriptor: t.Union[str, bytes], + ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" + try: + if isinstance(descriptor, str): + path = pathlib.Path(descriptor) + else: + path = pathlib.Path(descriptor.decode("utf-8")) + return FileSystemCommChannel(path) + except: + logger.warning(f"failed to create fs comm channel: {descriptor}") + raise diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 3c10319f81..932e734c8a 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -43,8 +43,9 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys from smartsim.error import SmartSimError -from tests.mli.channel import FileSystemCommChannel -from tests.mli.feature_store import MemoryFeatureStore + +from .channel import FileSystemCommChannel +from .feature_store import MemoryFeatureStore if t.TYPE_CHECKING: import conftest diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 4bf589ad4c..7b678239a0 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -31,12 +31,16 @@ from collections import namedtuple import pytest +from github.GitRelease import GitRelease from github.GitReleaseAsset import GitReleaseAsset from github.Requester import Requester import smartsim import smartsim._core.utils.helpers as helpers from smartsim._core._cli.scripts.dragon_install import ( + DEFAULT_DRAGON_REPO, + DEFAULT_DRAGON_VERSION, + DragonInstallRequest, cleanup, create_dotenv, install_dragon, @@ -44,7 +48,7 @@ retrieve_asset, retrieve_asset_info, ) -from smartsim._core._install.builder import WebTGZ +from smartsim._core._install.builder import _WebTGZ from smartsim.error.errors import SmartSimCLIActionCancelled # The tests in this file belong to the group_a group @@ -136,6 +140,35 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] return assets +@pytest.fixture +def test_releases(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitRelease]: + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"title": "mock-title"} + completed = True + + releases: t.List[GitRelease] = [] + + for python_version in ["py3.9", "py3.10", "py3.11"]: + for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]: + attributes = { + "title": f"{python_version}-{dragon_version}-release", + "tag_name": f"v{dragon_version}-weekly", + } + releases.append(GitRelease(requester, headers, attributes, completed)) + + return releases + + def test_cleanup_no_op(archive_path: pathlib.Path) -> None: """Ensure that the cleanup method doesn't bomb when called with missing archive path; simulate a failed download""" @@ -156,62 +189,6 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: assert not 
test_archive.exists() -def test_retrieve_cached( - test_dir: str, - # archive_path: pathlib.Path, - test_archive: pathlib.Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Verify that a previously retrieved asset archive is re-used and the - release asset retrieval is not attempted""" - - asset_id = 123 - - def mock_webtgz_extract(self_, target_) -> None: - mock_extraction_dir = pathlib.Path(target_) - with tarfile.TarFile.open(test_archive) as tar: - tar.extractall(mock_extraction_dir) - - # we'll use the mock extract to create the files that would normally be downloaded - expected_output_dir = test_archive.parent / str(asset_id) - mock_webtgz_extract(None, expected_output_dir) - - # get modification time of directory holding the "downloaded" archive - ts1 = expected_output_dir.stat().st_ctime - - requester = Requester( - auth=None, - base_url="https://github.com", - user_agent="mozilla", - per_page=10, - verify=False, - timeout=1, - retry=1, - pool_size=1, - ) - headers = {"mock-header": "mock-value"} - attributes = {"mock-attr": "mock-attr-value"} - completed = True - - asset = GitReleaseAsset(requester, headers, attributes, completed) - - # ensure mocked asset has values that we use... - monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) - monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) - monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) - - # show that retrieving an asset w/a different ID results in ignoring - # other wheels from prior downloads in the parent directory of the asset - asset_path = retrieve_asset(test_archive.parent, asset) - ts2 = asset_path.stat().st_ctime - - # NOTE: the file should be written to a subdir based on the asset ID - assert ( - asset_path == expected_output_dir - ) # shows that the expected path matches the output path - assert ts1 == ts2 # show that the file wasn't changed... - - def test_retrieve_updated( test_archive: pathlib.Path, monkeypatch: pytest.MonkeyPatch, @@ -222,7 +199,7 @@ def test_retrieve_updated( old_asset_id = 100 asset_id = 123 - def mock_webtgz_extract(self_, target_) -> None: + def mock__WebTGZ_extract(self_, target_) -> None: mock_extraction_dir = pathlib.Path(target_) with tarfile.TarFile.open(test_archive) as tar: tar.extractall(mock_extraction_dir) @@ -230,7 +207,7 @@ def mock_webtgz_extract(self_, target_) -> None: # we'll use the mock extract to create the files that would normally be downloaded expected_output_dir = test_archive.parent / str(asset_id) old_output_dir = test_archive.parent / str(old_asset_id) - mock_webtgz_extract(None, old_output_dir) + mock__WebTGZ_extract(None, old_output_dir) requester = Requester( auth=None, @@ -253,13 +230,14 @@ def mock_webtgz_extract(self_, target_) -> None: monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) monkeypatch.setattr( - WebTGZ, + _WebTGZ, "extract", - lambda s_, t_: mock_webtgz_extract(s_, expected_output_dir), + lambda s_, t_: mock__WebTGZ_extract(s_, expected_output_dir), ) # mock the retrieval of the updated archive # tell it to retrieve. 
it should return the path to the new download, not the old one - asset_path = retrieve_asset(test_archive.parent, asset) + request = DragonInstallRequest(test_archive.parent) + asset_path = retrieve_asset(request, asset) # sanity check we don't have the same paths assert old_output_dir != expected_output_dir @@ -298,11 +276,13 @@ def mock_webtgz_extract(self_, target_) -> None: ) def test_retrieve_asset_info( test_assets: t.Collection[GitReleaseAsset], + test_releases: t.Collection[GitRelease], monkeypatch: pytest.MonkeyPatch, dragon_pin: str, pyv: str, is_found: bool, is_crayex: bool, + test_dir: str, ) -> None: """Verify that an information is retrieved correctly based on the python version, platform (e.g. CrayEX, !CrayEx), and target dragon pin""" @@ -318,20 +298,23 @@ def test_retrieve_asset_info( "is_crayex_platform", lambda: is_crayex, ) + # avoid hitting github API ctx.setattr( smartsim._core._cli.scripts.dragon_install, - "dragon_pin", - lambda: dragon_pin, + "_get_all_releases", + lambda x: test_releases, ) # avoid hitting github API ctx.setattr( smartsim._core._cli.scripts.dragon_install, "_get_release_assets", - lambda: test_assets, + lambda x: test_assets, ) + request = DragonInstallRequest(test_dir, version=dragon_pin) + if is_found: - chosen_asset = retrieve_asset_info() + chosen_asset = retrieve_asset_info(request) assert chosen_asset assert pyv in chosen_asset.name @@ -343,7 +326,7 @@ def test_retrieve_asset_info( assert "crayex" not in chosen_asset.name.lower() else: with pytest.raises(SmartSimCLIActionCancelled): - retrieve_asset_info() + retrieve_asset_info(request) def test_check_for_utility_missing(test_dir: str) -> None: @@ -441,11 +424,12 @@ def mock_util_check(util: str) -> bool: assert is_cray == platform_result -def test_install_package_no_wheel(extraction_dir: pathlib.Path): +def test_install_package_no_wheel(test_dir: str, extraction_dir: pathlib.Path): """Verify that a missing wheel does not blow up and has a failure retcode""" exp_path = extraction_dir + request = DragonInstallRequest(test_dir) - result = install_package(exp_path) + result = install_package(request, exp_path) assert result != 0 @@ -454,7 +438,9 @@ def test_install_macos(monkeypatch: pytest.MonkeyPatch, extraction_dir: pathlib. with monkeypatch.context() as ctx: ctx.setattr(sys, "platform", "darwin") - result = install_dragon(extraction_dir) + request = DragonInstallRequest(extraction_dir) + + result = install_dragon(request) assert result == 1 @@ -471,7 +457,7 @@ def test_create_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str): # ensure no .env exists before trying to create it. assert not exp_env_path.exists() - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # ensure the .env is created as side-effect of create_dotenv assert exp_env_path.exists() @@ -493,7 +479,7 @@ def test_create_dotenv_existing_dir(monkeypatch: pytest.MonkeyPatch, test_dir: s # ensure no .env exists before trying to create it. 
assert not exp_env_path.exists() - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # ensure the .env is created as side-effect of create_dotenv assert exp_env_path.exists() @@ -518,7 +504,7 @@ def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir # ensure .env exists so we can update it assert exp_env_path.exists() - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # ensure the .env is created as side-effect of create_dotenv assert exp_env_path.exists() @@ -540,7 +526,7 @@ def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): with monkeypatch.context() as ctx: ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # ensure the .env is created as side-effect of create_dotenv content = exp_env_path.read_text(encoding="utf-8") diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 4fe8bf71b4..37c46a573b 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -37,7 +37,10 @@ import zmq import smartsim._core.config -from smartsim._core._cli.scripts.dragon_install import create_dotenv +from smartsim._core._cli.scripts.dragon_install import ( + DEFAULT_DRAGON_VERSION, + create_dotenv, +) from smartsim._core.config.config import get_config from smartsim._core.launcher.dragon.dragonLauncher import ( DragonConnector, @@ -494,7 +497,7 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st with monkeypatch.context() as ctx: ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv # verify config does exist @@ -517,7 +520,7 @@ def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): with monkeypatch.context() as ctx: ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # load config w/launcher connector = DragonConnector() @@ -541,7 +544,7 @@ def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): with monkeypatch.context() as ctx: ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) - create_dotenv(mock_dragon_root) + create_dotenv(mock_dragon_root, DEFAULT_DRAGON_VERSION) # load config w/launcher connector = DragonConnector() From b0b1db661873a8e969944c1263bc0522f401357d Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 18 Sep 2024 11:43:22 -0400 Subject: [PATCH 28/60] Ensure forks build latest public dragon in CI actions (#706) Fixes bug in build that causes dragon to fail when a PR is started from a fork. 
- Adds conditional to use dragon nightly repo from main repo only [ committed by @ankona ] [ approved by @al-rigazzi ] --- .github/workflows/run_tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f0b0ba6663..9cc03b5723 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -120,7 +120,11 @@ jobs: env: SMARTSIM_DRAGON_TOKEN: ${{ secrets.DRAGON_TOKEN }} run: | - smart build --device cpu --onnx -v --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10 + if [ -n "${SMARTSIM_DRAGON_TOKEN}" ]; then + smart build --device cpu --onnx -v --dragon-repo dragonhpc/dragon-nightly --dragon-version 0.10 + else + smart build --device cpu --onnx -v --dragon + fi SP=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/config/dragon/.env LLP=$(cat $SP | grep LD_LIBRARY_PATH | awk '{split($0, array, "="); print array[2]}') echo "LD_LIBRARY_PATH=$LLP:$LD_LIBRARY_PATH" >> $GITHUB_ENV From 0ebd5abf2ffc93f29db8cbd3f98ebf7ee9fcd163 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 19 Sep 2024 09:25:42 -0700 Subject: [PATCH 29/60] Clean up error handling in MLI (#698) Make error handling correct and consistent throughout the MLI. [ committed by @AlyssaCote ] [ reviewed by @al-rigazzi ] --- doc/changelog.md | 1 + smartsim/_core/mli/comm/channel/channel.py | 1 + .../_core/mli/comm/channel/dragon_channel.py | 19 ++- smartsim/_core/mli/comm/channel/dragon_fli.py | 27 +++- .../infrastructure/control/device_manager.py | 14 +- .../infrastructure/control/error_handling.py | 13 +- .../control/request_dispatcher.py | 6 +- .../infrastructure/control/worker_manager.py | 10 +- .../storage/backbone_feature_store.py | 2 +- .../storage/dragon_feature_store.py | 6 +- .../infrastructure/storage/feature_store.py | 1 - .../mli/infrastructure/worker/torch_worker.py | 130 +++++++++++++----- .../_core/mli/infrastructure/worker/worker.py | 28 +++- smartsim/_core/mli/message_handler.py | 55 ++++++-- tests/dragon/test_environment_loader.py | 3 +- tests/dragon/test_error_handling.py | 17 ++- tests/dragon/test_reply_building.py | 3 + tests/test_message_handler/test_request.py | 13 ++ tests/test_message_handler/test_response.py | 13 ++ 19 files changed, 274 insertions(+), 88 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index e56a911d0e..f819122a4b 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Update error handling for consistency - Parameterize installation of dragon package with `smart build` - Update docstrings - Implement asynchronous notifications for shared data diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index bfe7920891..9a12e4c8dc 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -49,6 +49,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :raises SmartSimError: If sending message fails """ @abstractmethod diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 710134357c..1363c0d675 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -53,6 +53,7 @@ def create_local(capacity: int = 0) -> dch.Channel: 
:param capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied :returns: The instantiated channel + :raises SmartSimError: If unable to attach local channel """ pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) channel: t.Optional[dch.Channel] = None @@ -73,12 +74,12 @@ def create_local(capacity: int = 0) -> dch.Channel: logger.debug( f"Channel {cid} created in pool {pool.serialize()} w/capacity {capacity}" ) - except Exception: + except Exception as e: if offset < 100: - logger.warning(f"Unable to attach to channnel id {cid}. Retrying...") + logger.warning(f"Unable to attach to channel id {cid}. Retrying...") else: logger.error(f"All attempts to attach local channel have failed") - raise + raise SmartSimError("Failed to attach local channel") from e return channel @@ -109,10 +110,16 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :raises SmartSimError: If sending message fails """ - with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value) - logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") + try: + with self._channel.sendh(timeout=timeout) as sendh: + sendh.send_bytes(value) + logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") + except Exception as e: + raise SmartSimError( + f"Error sending message: DragonCommChannel {self.descriptor!r}" + ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 12ae727af7..84d809c8ac 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -39,6 +39,7 @@ import smartsim._core.mli.comm.channel.channel as cch from smartsim._core.mli.comm.channel.dragon_channel import create_local +from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) @@ -70,16 +71,23 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :raises SmartSimError: If sending message fails """ - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: - sendh.send_bytes(value, timeout=timeout) - logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") + try: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + sendh.send_bytes(value, timeout=timeout) + logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") + except Exception as e: + raise SmartSimError( + f"Error sending message: DragonFLIChannel {self.descriptor!r}" + ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
        :param timeout: Maximum time to wait (in seconds) for messages to arrive
        :returns: The received message(s)
+        :raises SmartSimError: If receiving message(s) fails
        """
        messages = []
        eot = False
@@ -93,6 +101,10 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                )
            except fli.FLIEOT:
                eot = True
+            except Exception as e:
+                raise SmartSimError(
+                    f"Error receiving messages: DragonFLIChannel {self.descriptor!r}"
+                ) from e
        return messages
    @classmethod
@@ -104,13 +116,14 @@ def from_descriptor(
        :param descriptor: The descriptor that uniquely identifies the resource
        :returns: An attached DragonFLIChannel
-        :raises Exception: If creation of DragonFLIChanenel fails
+        :raises SmartSimError: If creation of DragonFLIChannel fails
        """
        try:
            return DragonFLIChannel(
                fli_desc=base64.b64decode(descriptor),
                sender_supplied=True,
            )
-        except:
-            logger.error(f"Error while creating DragonFLIChannel: {descriptor}")
-            raise
+        except Exception as e:
+            raise SmartSimError(
+                f"Error while creating DragonFLIChannel: {descriptor}"
+            ) from e
diff --git a/smartsim/_core/mli/infrastructure/control/device_manager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py
index 10531e701c..9334971f8c 100644
--- a/smartsim/_core/mli/infrastructure/control/device_manager.py
+++ b/smartsim/_core/mli/infrastructure/control/device_manager.py
@@ -65,16 +65,26 @@ def remove_model(self, key: str) -> None:
        """Remove the reference to a model loaded on this device.
        :param key: The key of the model to remove
+        :raises KeyError: If key does not exist for removal
        """
-        self._models.pop(key)
+        try:
+            self._models.pop(key)
+        except KeyError:
+            logger.warning(f"An unknown key was requested for removal: {key}")
+            raise
    def get_model(self, key: str) -> t.Any:
        """Get the model corresponding to a given key.
        :param key: The model key
        :returns: The model for the given key
+        :raises KeyError: If key does not exist
        """
-        return self._models[key]
+        try:
+            return self._models[key]
+        except KeyError:
+            logger.warning(f"An unknown key was requested: {key}")
+            raise
    def __contains__(self, key: str) -> bool:
        """Check if model with a given key is available on the device.
diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py
index 9de97b9b7e..8961cac543 100644
--- a/smartsim/_core/mli/infrastructure/control/error_handling.py
+++ b/smartsim/_core/mli/infrastructure/control/error_handling.py
@@ -54,7 +54,9 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder:
 def exception_handler(
-    exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str
+    exc: Exception,
+    reply_channel: t.Optional[CommChannelBase],
+    failure_message: t.Optional[str],
 ) -> None:
    """
    Logs exceptions and sends a failure response.
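The changes in this commit repeatedly apply the same idiom: catch the low-level failure, log or annotate it, and re-raise a library-level exception with the original error chained via `from` so the root-cause traceback is preserved for callers. A small, self-contained sketch of that pattern follows, using placeholder names rather than the actual MLI classes:

class SmartSimError(Exception):
    """Stand-in for smartsim.error.errors.SmartSimError."""


def _send_bytes(value: bytes) -> None:
    # Placeholder for a transport call that can fail
    raise OSError("channel unavailable")


def send(value: bytes) -> None:
    try:
        _send_bytes(value)
    except Exception as e:
        # Wrap in a library-level exception and chain the original error so
        # callers handle one exception type but still see the full traceback
        raise SmartSimError("Error sending message") from e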
@@ -63,12 +65,11 @@ def exception_handler( :param reply_channel: The channel used to send replies :param failure_message: Failure message to log and send back """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) + logger.exception(exc) if reply_channel: + if failure_message is None: + failure_message = str(exc) + serialized_resp = MessageHandler.serialize_response( build_failure_reply("fail", failure_message) ) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 07574b64ab..b4c34db955 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -352,7 +352,7 @@ def _on_iteration(self) -> None: exception_handler( ValueError("No request data found"), None, - "No request data found.", + None, ) request_bytes = bytes_list[0] @@ -371,7 +371,7 @@ def _on_iteration(self) -> None: exception_handler( ValueError("Error validating the request"), request.callback, - "Error validating the request.", + None, ) self._perf_timer.measure_time("validate_request") else: @@ -505,7 +505,7 @@ def flush_requests(self) -> None: exception_handler( exc, None, - "Error Transforming input.", + "Error transforming input.", ) continue diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index 0dcfc89d59..1840036153 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -175,7 +175,7 @@ def _on_iteration(self) -> None: exception_handler( ValueError("An invalid batch was received"), None, - "Error batching inputs, the batch was invalid.", + None, ) return @@ -230,7 +230,7 @@ def _on_iteration(self) -> None: exception_handler( ValueError("Error batching inputs"), request.callback, - "Error batching inputs.", + None, ) return transformed_input = batch.inputs @@ -241,7 +241,7 @@ def _on_iteration(self) -> None: ) except Exception as e: for request in batch.requests: - exception_handler(e, request.callback, "Failed while executing.") + exception_handler(e, request.callback, "Error while executing.") return self._perf_timer.measure_time("execute") @@ -252,7 +252,7 @@ def _on_iteration(self) -> None: except Exception as e: for request in batch.requests: exception_handler( - e, request.callback, "Failed while transforming the output." + e, request.callback, "Error while transforming the output." ) return @@ -267,7 +267,7 @@ def _on_iteration(self) -> None: ) except Exception as e: exception_handler( - e, request.callback, "Failed while placing the output." + e, request.callback, "Error while placing the output." 
) continue else: diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index cda31dde67..b6655bded6 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -413,7 +413,7 @@ def receive( elapsed = (time.time_ns() - start_at) / 1000000000 remaining = elapsed - self._global_timeout if remaining > 0: - logger.debug(f"consumer batch timeout exceeded by: {abs(remaining)}") + logger.debug(f"Consumer batch timeout exceeded by: {abs(remaining)}") break return messages diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index f1e22e2449..d7b37ffe61 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -63,7 +63,10 @@ def _get(self, key: str) -> t.Union[str, bytes]: :returns: The value identified by the key :raises KeyError: If the key has not been used to store a value """ - return self._storage[key] + try: + return self._storage[key] + except KeyError as e: + raise KeyError(f"Key not found in FeatureStore: {key}") from e def _set(self, key: str, value: t.Union[str, bytes]) -> None: """Store a value into the underlying storage mechanism. @@ -71,7 +74,6 @@ def _set(self, key: str, value: t.Union[str, bytes]) -> None: :param key: The unique key that identifies the resource :param value: The value to store :returns: The value identified by the key - :raises KeyError: If the key has not been used to store a value """ self._storage[key] = value diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index ba866d93d1..a55c523058 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -159,7 +159,6 @@ def _set(self, key: str, value: t.Union[str, bytes]) -> None: :param key: The unique key that identifies the resource :param value: The value to store - :raises KeyError: If the key has not been used to store a value """ @abstractmethod diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 0639d59696..64e94e5eb6 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -61,6 +61,15 @@ class TorchWorker(MachineLearningWorkerBase): def load_model( batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: + """Given a loaded MachineLearningModel, ensure it is loaded into + device memory. 
+ + :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed + :returns: LoadModelResult wrapping the model loaded for the request + :raises ValueError: If model reference object is not found + :raises RuntimeError: If loading and evaluating the model failed + """ if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes elif batch.raw_model and batch.raw_model.data: @@ -73,9 +82,15 @@ def load_model( device = device.replace(old, new) buffer = io.BytesIO(initial_bytes=model_bytes) - with torch.no_grad(): - model = torch.jit.load(buffer, map_location=device) # type: ignore - model.eval() + try: + with torch.no_grad(): + model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() + except Exception as e: + raise RuntimeError( + "Failed to load and evaluate the model: " + f"Model key {batch.model_id.key}, Device {device}" + ) from e result = LoadModelResult(model) return result @@ -85,6 +100,16 @@ def transform_input( fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. + + :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors + :returns: The transformed inputs wrapped in a TransformInputResult + :raises ValueError: If tensors cannot be reconstructed + :raises IndexError: If index out of range + """ results: list[torch.Tensor] = [] total_samples = 0 slices: list[slice] = [] @@ -123,12 +148,18 @@ def transform_input( alloc_size = int(np.prod(dims) * itemsize) mem_alloc = mem_pool.alloc(alloc_size) mem_view = mem_alloc.get_memview() - mem_view[:alloc_size] = b"".join( - [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results - ] - ) + try: + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + except IndexError as e: + raise IndexError( + "Error accessing elements in fetch_result.inputs " + f"with index {result_tensor_idx}" + ) from e results.append(mem_alloc.serialize()) @@ -142,6 +173,17 @@ def execute( transform_result: TransformInputResult, device: str, ) -> ExecuteResult: + """Execute an ML model on inputs transformed for use by the model. 
+ + :param batch: The batch of requests that triggered the pipeline + :param load_result: The result of loading the model onto device memory + :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed + :returns: The result of inference wrapped in an ExecuteResult + :raises SmartSimError: If model is not loaded + :raises IndexError: If memory slicing is out of range + :raises ValueError: If tensor creation fails or is unable to evaluate the model + """ if not load_result.model: raise SmartSimError("Model must be loaded to execute") device_to_torch = {"cpu": "cpu", "gpu": "cuda"} @@ -156,26 +198,36 @@ def execute( mem_alloc = MemoryAlloc.attach(transformed) mem_allocs.append(mem_alloc) itemsize = np.empty((1), dtype=dtype).itemsize - tensors.append( - torch.from_numpy( - np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], - dtype=dtype, - ).reshape(dims) + try: + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) ) - ) + except IndexError as e: + raise IndexError("Error during memory slicing") from e + except Exception as e: + raise ValueError("Error during tensor creation") from e model: torch.nn.Module = load_result.model - with torch.no_grad(): - model.eval() - results = [ - model( - *[ - tensor.to(device, non_blocking=True).detach() - for tensor in tensors - ] - ) - ] + try: + with torch.no_grad(): + model.eval() + results = [ + model( + *[ + tensor.to(device, non_blocking=True).detach() + for tensor in tensors + ] + ) + ] + except Exception as e: + raise ValueError( + f"Error while evaluating the model: Model {batch.model_id.key}" + ) from e transform_result.transformed = [] @@ -189,6 +241,15 @@ def transform_output( batch: RequestBatch, execute_result: ExecuteResult, ) -> list[TransformOutputResult]: + """Given inference results, perform transformations required to + transmit results to the requestor. + + :param batch: The batch of requests that triggered the pipeline + :param execute_result: The result of inference wrapped in an ExecuteResult + :returns: A list of transformed outputs + :raises IndexError: If indexing is out of range + :raises ValueError: If transforming output fails + """ transformed_list: list[TransformOutputResult] = [] cpu_predictions = [ prediction.cpu() for prediction in execute_result.predictions @@ -196,12 +257,19 @@ def transform_output( for result_slice in execute_result.slices: transformed = [] for cpu_item in cpu_predictions: - transformed.append(cpu_item[result_slice].numpy().tobytes()) + try: + transformed.append(cpu_item[result_slice].numpy().tobytes()) - # todo: need the shape from latest schemas added here. - transformed_list.append( - TransformOutputResult(transformed, None, "c", "float32") - ) # fixme + # todo: need the shape from latest schemas added here. 
+ transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme + except IndexError as e: + raise IndexError( + f"Error accessing elements: result_slice {result_slice}" + ) from e + except Exception as e: + raise ValueError("Error transforming output") from e execute_result.predictions = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index ad152e5d7e..a91e8bf878 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -400,8 +400,8 @@ def fetch_model( :raises SmartSimError: If neither a key or a model are provided or the model cannot be retrieved from the feature store :raises ValueError: If a feature store is not available and a raw - model is not provided""" - + model is not provided + """ # All requests in the same batch share the model if batch.raw_model: return FetchModelResult(batch.raw_model.data) @@ -435,7 +435,8 @@ def fetch_inputs( :param feature_stores: Available feature stores used for persistence :returns: The fetched input :raises ValueError: If neither an input key or an input tensor are provided - :raises SmartSimError: If a tensor for a given key cannot be retrieved""" + :raises SmartSimError: If a tensor for a given key cannot be retrieved + """ fetch_results = [] for request in batch.requests: if request.raw_inputs: @@ -514,7 +515,10 @@ def load_model( :param request: The request that triggered the pipeline :param device: The device on which the model must be placed - :returns: LoadModelResult wrapping the model loaded for the request""" + :returns: LoadModelResult wrapping the model loaded for the request + :raises ValueError: If model reference object is not found + :raises RuntimeError: If loading and evaluating the model failed + """ @staticmethod @abstractmethod @@ -529,7 +533,10 @@ def transform_input( :param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors - :returns: The transformed inputs wrapped in a TransformInputResult""" + :returns: The transformed inputs wrapped in a TransformInputResult + :raises ValueError: If tensors cannot be reconstructed + :raises IndexError: If index out of range + """ @staticmethod @abstractmethod @@ -545,7 +552,11 @@ def execute( :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption :param device: The device on which the model will be executed - :returns: The result of inference wrapped in an ExecuteResult""" + :returns: The result of inference wrapped in an ExecuteResult + :raises SmartSimError: If model is not loaded + :raises IndexError: If memory slicing is out of range + :raises ValueError: If tensor creation fails or is unable to evaluate the model + """ @staticmethod @abstractmethod @@ -557,4 +568,7 @@ def transform_output( :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :returns: A list of transformed outputs""" + :returns: A list of transformed outputs + :raises IndexError: If indexing is out of range + :raises ValueError: If transforming output fails + """ diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 5b6f846fc8..71def143ad 100644 --- 
a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -401,8 +401,19 @@ def serialize_request(request: request_capnp.RequestBuilder) -> bytes: :param request: Request to be serialized :returns: Serialized request bytes + :raises ValueError: If serialization fails """ - return request.to_bytes() + display_name = request.schema.node.displayName # type: ignore + class_name = display_name.split(":")[-1] + if class_name != "Request": + raise ValueError( + "Error serializing the request. Value passed in is not " + f"a request: {class_name}" + ) + try: + return request.to_bytes() + except Exception as e: + raise ValueError("Error serializing the request") from e @staticmethod def deserialize_request(request_bytes: bytes) -> request_capnp.Request: @@ -411,13 +422,17 @@ def deserialize_request(request_bytes: bytes) -> request_capnp.Request: :param request_bytes: Bytes to be deserialized into a request :returns: Deserialized request + :raises ValueError: If deserialization fails """ - bytes_message = request_capnp.Request.from_bytes( - request_bytes, traversal_limit_in_words=2**63 - ) + try: + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) - with bytes_message as message: - return message + with bytes_message as message: + return message + except Exception as e: + raise ValueError("Error deserializing the request") from e @staticmethod def _assign_status( @@ -552,8 +567,19 @@ def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: :param response: Response to be serialized :returns: Serialized response bytes + :raises ValueError: If serialization fails """ - return response.to_bytes() + display_name = response.schema.node.displayName # type: ignore + class_name = display_name.split(":")[-1] + if class_name != "Response": + raise ValueError( + "Error serializing the response. 
Value passed in is not " + f"a response: {class_name}" + ) + try: + return response.to_bytes() + except Exception as e: + raise ValueError("Error serializing the response") from e @staticmethod def deserialize_response(response_bytes: bytes) -> response_capnp.Response: @@ -562,10 +588,15 @@ def deserialize_response(response_bytes: bytes) -> response_capnp.Response: :param response_bytes: Bytes to be deserialized into a response :returns: Deserialized response + :raises ValueError: If deserialization fails """ - bytes_message = response_capnp.Response.from_bytes( - response_bytes, traversal_limit_in_words=2**63 - ) + try: + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) + + with bytes_message as message: + return message - with bytes_message as message: - return message + except Exception as e: + raise ValueError("Error deserializing the response") from e diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index c3331336e5..e9bcc8dfd9 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -39,6 +39,7 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim.error.errors import SmartSimError # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -100,7 +101,7 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): queue_factory=DragonFLIChannel.from_descriptor, ) - with pytest.raises(DragonFLIError): + with pytest.raises(SmartSimError): config.get_queue() diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 7f823a1c43..618b00d87e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -302,6 +302,10 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() + mock_response = MagicMock() + mock_response.schema.node.displayName = "Response" + mock_reply_fn.return_value = mock_response + monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, @@ -346,14 +350,14 @@ def mock_exception_handler(exc, reply_channel, failure_message): "Error loading model on device or getting device.", id="load model", ), - pytest.param("execute", "Failed while executing.", id="execute"), + pytest.param("execute", "Error while executing.", id="execute"), pytest.param( "transform_output", - "Failed while transforming the output.", + "Error while transforming the output.", id="transform output", ), pytest.param( - "place_output", "Failed while placing the output.", id="place output" + "place_output", "Error while placing the output.", id="place output" ), ], ) @@ -436,7 +440,7 @@ def test_wm_pipeline_stage_errors_handled( ), pytest.param( "transform_input", - "Error Transforming input.", + "Error transforming input.", id="transform input", ), ], @@ -477,6 +481,11 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): mock_reply_channel.send = MagicMock() mock_reply_fn = MagicMock() + + mock_response = MagicMock() + mock_response.schema.node.displayName = "Response" + mock_reply_fn.return_value = mock_response + monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index 7a8e637803..063200dd64 100644 --- 
a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -50,6 +50,9 @@ def test_build_failure_reply(status: "Status", message: str): "Ensures failure replies can be built successfully" response = build_failure_reply(status, message) + display_name = response.schema.node.displayName # type: ignore + class_name = display_name.split(":")[-1] + assert class_name == "Response" assert response.status == status assert response.message == message diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 8be9c11a67..7ede41b50d 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -434,3 +434,16 @@ def test_serialize_request_successful(req): deserialized = MessageHandler.deserialize_request(serialized) assert deserialized.to_dict() == req.to_dict() + + +def test_serialization_fails(): + with pytest.raises(ValueError): + bad_request = MessageHandler.serialize_request(tensor_1) + + +def test_deserialization_fails(): + with pytest.raises(ValueError): + new_req = torch_direct_request.copy() + req_bytes = MessageHandler.serialize_request(new_req) + req_bytes = req_bytes + b"extra bytes" + deser = MessageHandler.deserialize_request(req_bytes) diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index d6894eb5cc..86774132ec 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -176,3 +176,16 @@ def test_serialize_response(response): deserialized = MessageHandler.deserialize_response(serialized) assert deserialized.to_dict() == response.to_dict() + + +def test_serialization_fails(): + with pytest.raises(ValueError): + bad_response = MessageHandler.serialize_response(result_key1) + + +def test_deserialization_fails(): + with pytest.raises(ValueError): + new_resp = torch_direct_response.copy() + resp_bytes = MessageHandler.serialize_response(new_resp) + resp_bytes = resp_bytes + b"extra bytes" + deser = MessageHandler.deserialize_response(resp_bytes) From 733603143a5cadc446562ef9aab1c908192c0092 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 19 Sep 2024 12:38:17 -0500 Subject: [PATCH 30/60] Version used to run benchmarks --- ex/high_throughput_inference/mli_driver.py | 56 +++++++++++----- ex/high_throughput_inference/mock_app.py | 67 ++++++++++--------- .../mock_app_redis.py | 38 +++++++---- ex/high_throughput_inference/redis_driver.py | 37 +++++++--- .../standalone_workermanager.py | 5 +- .../_core/launcher/dragon/dragonBackend.py | 12 +++- smartsim/_core/mli/comm/channel/dragonfli.py | 5 ++ .../control/requestdispatcher.py | 23 +++++-- .../infrastructure/control/workermanager.py | 23 ++++++- smartsim/_core/utils/timings.py | 9 +-- 10 files changed, 191 insertions(+), 84 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 807a70b219..3ff27c3e37 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,17 +1,29 @@ -import os +import argparse import base64 -import cloudpickle +import os +import shutil import sys +import time +import typing as t + +import cloudpickle + from smartsim import Experiment from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim.status import TERMINAL_STATUSES from smartsim.settings import DragonRunSettings -import time -import typing as t +from smartsim.status import TERMINAL_STATUSES + 
+parser = argparse.ArgumentParser("Mock application") +parser.add_argument("--log_max_batchsize", default=8, type=int) +parser.add_argument("--num_nodes_app", default=1, type=int) +args = parser.parse_args() DEVICE = "gpu" -NUM_RANKS = 4 +NUM_RANKS_PER_NODE = 16 +NUM_NODES_APP = args.num_nodes_app NUM_WORKERS = 1 +BATCH_SIZE = 2 +BATCH_TIMEOUT = 0.0 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") @@ -21,9 +33,19 @@ os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport -exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") +exp_path = os.path.join( + filedir, + "benchmark", + f"throughput_n{NUM_NODES_APP}_rpn{NUM_RANKS_PER_NODE}_timeout{BATCH_TIMEOUT}", + f"samples{2**args.log_max_batchsize}", +) +try: + shutil.rmtree(exp_path) + time.sleep(2) +except: + pass os.makedirs(exp_path, exist_ok=True) -exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) +exp = Experiment("MLI_benchmark", launcher="dragon", exp_path=exp_path) torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") @@ -36,33 +58,35 @@ "--worker_class", torch_worker_str, "--batch_size", - str(NUM_RANKS//NUM_WORKERS), + str(BATCH_SIZE), "--batch_timeout", - str(0.00), + str(BATCH_TIMEOUT), "--num_workers", - str(NUM_WORKERS) + str(NUM_WORKERS), ], ) aff = [] worker_manager_rs.set_cpu_affinity(aff) - +worker_manager_rs.set_gpu_affinity([0, 1, 2, 3]) +worker_manager_rs.set_hostlist(["pinoak0037"]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(args.log_max_batchsize)], ) -app_rs.set_tasks_per_node(NUM_RANKS) - +app_rs.set_tasks_per_node(NUM_RANKS_PER_NODE) +app_rs.set_nodes(NUM_NODES_APP) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) exp.generate(worker_manager, app, overwrite=True) -exp.start(worker_manager, app, block=False) +exp.start(worker_manager, block=False) +exp.start(app, block=False) while True: if exp.get_status(app)[0] in TERMINAL_STATUSES: diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 517d18fb2f..7e53efce02 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -41,6 +41,9 @@ import os import time import torch +import typing as t + +import warnings from mpi4py import MPI from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( @@ -54,14 +57,13 @@ torch.set_num_threads(1) logger = get_logger("App") -logger.info("Started app") -CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False +warnings.filterwarnings("ignore", "\*The given NumPy array is not writable\*") class ProtoClient: def __init__(self, timing_on: bool): - comm = MPI.COMM_WORLD - rank = comm.Get_rank() + self._comm = MPI.COMM_WORLD + self._rank = self._comm.Get_rank() connect_to_infrastructure() ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) @@ -77,9 +79,10 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - 
self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") + self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{self._rank}_") + self._num_its: int = 0 - def run_model(self, model: bytes | str, batch: torch.Tensor): + def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor): tensors = [batch.numpy()] self.perf_timer.start_timings("batch_size", batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( @@ -106,6 +109,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! + # logger.info(f"{self._rank} sent tensors") self.perf_timer.measure_time("send_tensors") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) @@ -124,20 +128,26 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("deserialize_tensor") self.perf_timer.end_timings() + self._num_its += 1 + # logger.info(f"{self._rank} got to the barrier {self._num_its}") + self._comm.Barrier() + # time.sleep(0.01) + # logger.info(f"{self._rank} made it past the barrier {self._num_its}") return result def set_model(self, key: str, model: bytes): self._ddict[key] = model - class ResNetWrapper: def __init__(self, name: str, model: str): - self._model = torch.jit.load(model) + self._model = None # torch.jit.load(model) self._name = name - buffer = io.BytesIO() - scripted = torch.jit.trace(self._model, self.get_batch()) - torch.jit.save(scripted, buffer) + + # scripted = torch.jit.trace(self._model, self.get_batch()) + # torch.jit.save(scripted, buffer) + with open(model, "rb") as model_file: + buffer = io.BytesIO(model_file.read()) self._serialized_model = buffer.getvalue() def get_batch(self, batch_size: int = 32): @@ -151,6 +161,10 @@ def model(self): def name(self): return self._name +def log(msg: str, rank: int) -> None: + if rank == 0: + logger.info(msg) + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -161,29 +175,20 @@ def name(self): resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True) - client.set_model(resnet.name, resnet.model) + if client._rank == 0: + client.set_model(resnet.name, resnet.model) - if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: - # TODO: adapt to non-Nvidia devices - torch_device = args.device.replace("gpu", "cuda") - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) + MPI.COMM_WORLD.Barrier() - TOTAL_ITERATIONS = 100 + TOTAL_ITERATIONS = 10 - for log2_bsize in range(args.log_max_batchsize+1): + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize+1): b_size: int = 2**log2_bsize - logger.info(f"Batch size: {b_size}") - for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): - logger.info(f"Iteration: {iteration_number}") + log(f"Batch size: {b_size}", client._rank) + for iteration_number in range(TOTAL_ITERATIONS): + # log(f"Iteration: {iteration_number}", client._rank) sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) - logger.info(client.perf_timer.get_last("total_time")) - if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: - local_res = pt_model(sample_batch.to(torch_device)) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() - res_norm = 
torch.linalg.vector_norm(remote_result, ord=1).item() - local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() - logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") - torch.cuda.synchronize() - - client.perf_timer.print_timings(to_file=True) \ No newline at end of file + log(f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", client._rank) + + client.perf_timer.print_timings(to_file=True, to_stdout=client._rank==0) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index 8978bcea23..9cdb336be2 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -38,11 +38,10 @@ class ResNetWrapper(): def __init__(self, name: str, model: str): - self._model = torch.jit.load(model) + self._model = None self._name = name - buffer = io.BytesIO() - scripted = torch.jit.trace(self._model, self.get_batch()) - torch.jit.save(scripted, buffer) + with open(model, "rb") as model_file: + buffer = io.BytesIO(model_file.read()) self._serialized_model = buffer.getvalue() def get_batch(self, batch_size: int=32): @@ -56,6 +55,11 @@ def model(self): def name(self): return self._name + +def log(msg: str, rank: int) -> None: + if rank == 0: + logger.info(msg) + if __name__ == "__main__": comm = MPI.COMM_WORLD @@ -63,28 +67,38 @@ def name(self): parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") + parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() - resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = Client(cluster=False, address=None) - client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) - perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"redis{rank}_") + if rank == 0: + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + comm.Barrier() + + perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=True, prefix=f"redis{rank}_") total_iterations = 100 timings=[] - for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: - logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize+1): + batch_size: int = 2**log2_bsize + log(f"Batch size: {batch_size}", rank) + for iteration_number in range(total_iterations): perf_timer.start_timings("batch_size", batch_size) - logger.info(f"Iteration: {iteration_number}") input_name = f"batch_{rank}" output_name = f"result_{rank}" client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + perf_timer.measure_time("send_request") client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + perf_timer.measure_time("run_model") result = client.get_tensor(name=output_name) + perf_timer.measure_time("receive_response") perf_timer.end_timings() + comm.Barrier() + log(f"Completed iteration: {iteration_number} in {perf_timer.get_last('total_time')} seconds", rank) - perf_timer.print_timings(True) + perf_timer.print_timings(True, to_stdout=rank==0) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index ff57725d40..0774b88f98 
100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -24,29 +24,50 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse import os import sys +import time + from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES -import time DEVICE = "gpu" +NUM_TASKS_PER_NODE = 16 + filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") +parser = argparse.ArgumentParser("Mock application") +parser.add_argument("--num_nodes_app", default=1, type=int) +parser.add_argument("--log_max_batchsize", default=8, type=int) +args = parser.parse_args() + +NUM_NODES = args.num_nodes_app + +exp_path = os.path.join( + filedir, + "benchmark", + f"redis_ai_multi_n{NUM_NODES}_rpn{NUM_TASKS_PER_NODE}", + f"samples{2**args.log_max_batchsize}", +) +try: + shutil.rmtree(exp_path) + time.sleep(2) +except: + pass -exp_path = os.path.join(filedir, "redis_ai_multi") os.makedirs(exp_path, exist_ok=True) exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path) -db = exp.create_database(interface="hsn0") +db = exp.create_database(interface="hsn0", hosts=["pinoak0036"]) app_rs = exp.create_run_settings( - sys.executable, exe_args = [app_script_name, "--device", DEVICE] - ) -app_rs.set_nodes(1) -app_rs.set_tasks(4) + sys.executable, exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(args.log_max_batchsize)] +) +app_rs.set_nodes(NUM_NODES) +app_rs.set_tasks(NUM_NODES * NUM_TASKS_PER_NODE) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) @@ -63,4 +84,4 @@ break time.sleep(5) -print("Exiting.") \ No newline at end of file +print("Exiting.") diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 0b8c61251b..879567c525 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -172,6 +172,7 @@ def service_as_dragon_proc( batch_size=args.batch_size, config_loader=config_loader, worker_type=arg_worker_type, + mem_pool_size=128*1024**3, ) wms = [] @@ -215,9 +216,11 @@ def service_as_dragon_proc( # TODO: use ProcessGroup and restart=True? 
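The redis_driver.py baseline above follows the same pattern. A minimal sketch of how its new arguments translate into MPI ranks and the experiment directory (hypothetical values; the constants match those defined in the driver):

    # Hypothetical invocation of the Redis baseline:
    #   python redis_driver.py --num_nodes_app 2 --log_max_batchsize 8
    num_nodes, tasks_per_node, log_max_batchsize = 2, 16, 8

    total_ranks = num_nodes * tasks_per_node  # passed to app_rs.set_tasks(), i.e. 32
    run_dir = f"redis_ai_multi_n{num_nodes}_rpn{tasks_per_node}/samples{2**log_max_batchsize}"
    # total_ranks == 32, run_dir == "redis_ai_multi_n2_rpn16/samples256"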
all_procs = [dispatcher_proc, *worker_manager_procs] - print(f"Dispatcher proc: {dispatcher_proc}") for proc in all_procs: proc.start() while all(proc.is_alive for proc in all_procs): time.sleep(1) + + for proc in all_procs: + logger.info(f"{proc} is alive: {proc.is_alive}") \ No newline at end of file diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 7526af14ad..e44e305db4 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -522,10 +522,10 @@ def _stop_steps(self) -> None: and proc_group.status == DragonStatus.RUNNING ): try: - proc_group.kill() + proc_group.stop() except dragon_process_group.DragonProcessGroupError: try: - proc_group.stop() + proc_group.kill() except dragon_process_group.DragonProcessGroupError: logger.error("Process group already stopped") redir_group = self._group_infos[step_id].redir_workers @@ -863,6 +863,12 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: self._frontend_shutdown = request.frontend_shutdown return DragonShutdownResponse() + def __del__(self) -> None: + try: + self._ddict.destroy() + except Exception: + logger.error("Could not destroy Backbone dictionary") + class DragonBackendView: def __init__(self, backend: DragonBackend) -> None: @@ -945,4 +951,4 @@ def _host_table_line(host: str) -> list[str]: return tabulate( values, headers, disable_numparse=True, tablefmt="github", colalign=colalign - ) + ) \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 130c5cf5eb..ea80e9d19a 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -68,13 +68,18 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False + parts = 0 with self._fli.recvh(timeout=0.001) as recvh: + # print(">>>> FLI IS RECEIVING <<<<", flush=True) while not eot: try: message, _ = recvh.recv_bytes(timeout=None) messages.append(message) + # print(parts, flush=True) + parts += 1 except fli.FLIEOT: eot = True + # print(f"<<<< FLI IS DONE {parts} >>>>", flush=True) return messages @classmethod diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index d56912a8f0..31bea966a3 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -131,9 +131,9 @@ def ready(self) -> bool: return False timed_out = ( - self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + self._elapsed_time >= self._batch_timeout ) - logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") + # logger.info(f"Is full: {self.full()} or has timed out: {timed_out}") return self.full() or timed_out def make_disposable(self) -> None: @@ -225,6 +225,10 @@ def __init__( """Memory pool used to share batched input tensors with the Worker Managers""" self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" + self._processed_requests: int = 0 + """Number of requests processed by this dispatcher""" + self._sent_batches: int = 0 + """Number of batches sent to Worker Managers""" def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available @@ -317,20 +321,23 @@ def _on_iteration(self) -> None: """ 
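One behavioral consequence of the requestdispatcher.py change above is that the guard on a positive batch timeout is gone: with the benchmark's BATCH_TIMEOUT of 0.0, a queue holding any request now times out immediately rather than waiting to fill. A standalone sketch of the resulting readiness rule (not the actual BatchQueue class; the names and the empty-queue check are illustrative):

    def queue_ready(num_queued: int, batch_size: int, elapsed_s: float, batch_timeout_s: float) -> bool:
        # An empty queue is never ready to be flushed.
        if num_queued == 0:
            return False
        # A full queue is always ready; with the `batch_timeout > 0` guard removed,
        # a timeout of 0.0 means any elapsed time also marks the queue as ready.
        full = num_queued >= batch_size
        timed_out = elapsed_s >= batch_timeout_s
        return full or timed_out

    assert queue_ready(num_queued=1, batch_size=2, elapsed_s=0.001, batch_timeout_s=0.0)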
try: self._perf_timer.set_active(True) + pre_receive = time.perf_counter() bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: self._perf_timer.set_active(False) else: + self._processed_requests += 1 + # print(f">>>> PROCESSING REQUEST {self._processed_requests} (free memory: {self._mem_pool.free_space})<<<<") if not bytes_list: exception_handler( ValueError("No request data found"), None, "No request data found.", ) - + post_receive = time.perf_counter() request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] - self._perf_timer.start_timings() + self._perf_timer.start_timings(first_label="receive", first_value=post_receive-pre_receive) request = self._worker.deserialize_message( request_bytes, self._callback_factory @@ -351,14 +358,15 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("validate_request") self.dispatch(request) self._perf_timer.measure_time("dispatch") + # print(f"<<<< PROCESSED {self._processed_requests} REQUESTS >>>>") finally: self.flush_requests() self.remove_queues() self._perf_timer.end_timings() - if self._perf_timer.max_length == 801 and self._perf_timer.is_active: - self._perf_timer.print_timings(True) + # if self._perf_timer.max_length == 1600 and self._perf_timer.is_active: + # self._perf_timer.print_timings(True) def remove_queues(self) -> None: """Remove references to queues that can be removed @@ -444,6 +452,8 @@ def flush_requests(self) -> None: for queue_list in self._queues.values(): for queue in queue_list: if queue.ready: + self._sent_batches += 1 + # print(f">>>> SENDING {self._sent_batches} BATCH <<<<") self._perf_timer.measure_time("find_queue") try: batch = RequestBatch( @@ -495,6 +505,7 @@ def flush_requests(self) -> None: ) continue self._perf_timer.measure_time("put") + # print(f">>>> SENT {self._sent_batches} BATCHES <<<<") def _can_shutdown(self) -> bool: """Whether the Service can be shut down""" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 54a245b813..89975e54b0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -108,6 +108,10 @@ def __init__( """Object responsible for model caching and device access""" self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" + self._processed_batches: int = 0 + """Number of processed request batches""" + self._sent_responses: int = 0 + """Number of sent responses""" def _on_start(self) -> None: """Called on initial entry into Service `execute` event loop before @@ -165,9 +169,20 @@ def _on_iteration(self) -> None: pre_batch_time = time.perf_counter() try: - batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) + batch: RequestBatch = self._dispatcher_queue.get(timeout=None) except Empty: return + except Exception as exc: + exception_handler( + exc, + None, + "Error receiving batch.", + ) + return + + + self._processed_batches += 1 + # print(f"**** PROCESSING BATCH {self._processed_batches} ****", flush=True) self._perf_timer.start_timings( "flush_requests", time.perf_counter() - pre_batch_time @@ -259,6 +274,7 @@ def _on_iteration(self) -> None: return for request, transformed_output in zip(batch.requests, transformed_outputs): + self._sent_responses += 1 reply = InferenceReply() if request.output_keys: try: @@ -305,9 +321,10 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("send") self._perf_timer.end_timings() + # 
print(f"**** PROCESSED {self._processed_batches} BATCHES AND {self._sent_responses} REPLIES ****", flush=True) - if self._perf_timer.max_length == 801: - self._perf_timer.print_timings(True) + # if self._perf_timer.max_length == 1600: + # self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index a61a243220..b890153891 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -68,7 +68,7 @@ def start_timings( if first_label is not None and first_value is not None: mod_label = self._make_label(first_label) value = self._format_number(first_value) - self._log(f"Started timing: {first_label}: {value}") + self._log(f"Started timing: {mod_label}: {value}") self._add_label_to_timings(mod_label) self._timings[mod_label].append(value) self._start = time.perf_counter() @@ -119,15 +119,16 @@ def max_length(self) -> int: return 0 return max(len(value) for value in self._timings.values()) - def print_timings(self, to_file: bool = False) -> None: - print(" ".join(self._timings.keys())) + def print_timings(self, to_file: bool = False, to_stdout: bool = True) -> None: + if to_stdout: + print(" ".join(self._timings.keys())) try: value_array = np.array(list(self._timings.values()), dtype=float) except Exception as e: logger.exception(e) return value_array = np.transpose(value_array) - if self._debug: + if self._debug and to_stdout: for i in range(value_array.shape[0]): print(" ".join(self._format_number(value) for value in value_array[i])) if to_file: From d43f7c7cb979cb8ee66376ecef8f6b3e4c0fcbc8 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:21:45 -0700 Subject: [PATCH 31/60] MLI helper methods (#709) Helper methods added to InferenceReply and InferenceRequest. [ committed by @AlyssaCote ] [ reviewed by @al-rigazzi ] --- doc/changelog.md | 1 + .../control/request_dispatcher.py | 20 ++- .../infrastructure/control/worker_manager.py | 22 ++-- .../_core/mli/infrastructure/worker/worker.py | 76 ++++++++++- tests/dragon/test_error_handling.py | 6 + tests/dragon/test_inference_reply.py | 76 +++++++++++ tests/dragon/test_inference_request.py | 118 ++++++++++++++++++ 7 files changed, 303 insertions(+), 16 deletions(-) create mode 100644 tests/dragon/test_inference_reply.py create mode 100644 tests/dragon/test_inference_request.py diff --git a/doc/changelog.md b/doc/changelog.md index f819122a4b..45f8e4005e 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add helper methods to MLI classes - Update error handling for consistency - Parameterize installation of dragon package with `smart build` - Update docstrings diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index b4c34db955..1c3b0f4c85 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -253,6 +253,14 @@ def __init__( self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" + @property + def has_featurestore_factory(self) -> bool: + """Check if the RequestDispatcher has a FeatureStore factory. 
+ + :returns: True if there is a FeatureStore factory, False otherwise + """ + return self._featurestore_factory is not None + def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available. @@ -272,7 +280,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - if self._featurestore_factory is None: + if self.has_featurestore_factory: logger.error("No feature store factory configured") return False @@ -292,7 +300,7 @@ def _check_model(self, request: InferenceRequest) -> bool: :param request: The request to validate :returns: False if model validation fails for the request, True otherwise """ - if request.model_key or request.raw_model: + if request.has_model_key or request.has_raw_model: return True logger.error("Unable to continue without model bytes or feature store key") @@ -305,7 +313,7 @@ def _check_inputs(self, request: InferenceRequest) -> bool: :param request: The request to validate :returns: False if input validation fails for the request, True otherwise """ - if request.input_keys or request.raw_inputs: + if request.has_input_keys or request.has_raw_inputs: return True logger.error("Unable to continue without input bytes or feature store keys") @@ -318,7 +326,7 @@ def _check_callback(self, request: InferenceRequest) -> bool: :param request: The request to validate :returns: False if callback validation fails for the request, True otherwise """ - if request.callback is not None: + if request.callback: return True logger.error("No callback channel provided in request") @@ -362,7 +370,7 @@ def _on_iteration(self) -> None: request = self._worker.deserialize_message( request_bytes, self._callback_factory ) - if request.input_meta and tensor_bytes_list: + if request.has_input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list self._perf_timer.measure_time("deserialize_message") @@ -445,7 +453,7 @@ def dispatch(self, request: InferenceRequest) -> None: :param request: The request to place """ - if request.raw_model is not None: + if request.has_raw_model: logger.debug("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index 1840036153..bf6fddb81d 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -109,6 +109,14 @@ def __init__( self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" + @property + def has_featurestore_factory(self) -> bool: + """Check if the WorkerManager has a FeatureStore factory. 
+ + :returns: True if there is a FeatureStore factory, False otherwise + """ + return self._featurestore_factory is not None + def _on_start(self) -> None: """Called on initial entry into Service `execute` event loop before `_on_iteration` is invoked.""" @@ -132,7 +140,7 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - if self._featurestore_factory is None: + if not self.has_featurestore_factory: logger.error("No feature store factory configured") return False @@ -151,7 +159,7 @@ def _validate_batch(self, batch: RequestBatch) -> bool: :param batch: The batch of requests to validate :returns: False if the request fails any validation checks, True otherwise """ - if batch is None or len(batch.requests) == 0: + if batch is None or not batch.has_valid_requests: return False return self._check_feature_stores(batch) @@ -179,7 +187,7 @@ def _on_iteration(self) -> None: ) return - if self._device_manager is None: + if not self._device_manager: for request in batch.requests: msg = "No Device Manager found. WorkerManager._on_start() " "must be called after initialization. If possible, " @@ -225,7 +233,7 @@ def _on_iteration(self) -> None: return self._perf_timer.measure_time("load_model") - if batch.inputs is None: + if not batch.inputs: for request in batch.requests: exception_handler( ValueError("Error batching inputs"), @@ -258,7 +266,7 @@ def _on_iteration(self) -> None: for request, transformed_output in zip(batch.requests, transformed_outputs): reply = InferenceReply() - if request.output_keys: + if request.has_output_keys: try: reply.output_keys = self._worker.place_output( request, @@ -274,7 +282,7 @@ def _on_iteration(self) -> None: reply.outputs = transformed_output.outputs self._perf_timer.measure_time("assign_output") - if reply.outputs is None or not reply.outputs: + if not reply.has_outputs: response = build_failure_reply("fail", "Outputs not found.") else: reply.status_enum = "complete" @@ -296,7 +304,7 @@ def _on_iteration(self) -> None: if request.callback: request.callback.send(serialized_resp) - if reply.outputs: + if reply.has_outputs: # send tensor data after response for output in reply.outputs: request.callback.send(output) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index a91e8bf878..530d251540 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -94,6 +94,58 @@ def __init__( self.batch_size = batch_size """The batch size to apply when batching""" + @property + def has_raw_model(self) -> bool: + """Check if the InferenceRequest contains a raw_model. + + :returns: True if raw_model is not None, False otherwise + """ + return self.raw_model is not None + + @property + def has_model_key(self) -> bool: + """Check if the InferenceRequest contains a model_key. + + :returns: True if model_key is not None, False otherwise + """ + return self.model_key is not None + + @property + def has_raw_inputs(self) -> bool: + """Check if the InferenceRequest contains raw_outputs. + + :returns: True if raw_outputs is not None and is not an empty list, + False otherwise + """ + return self.raw_inputs is not None and bool(self.raw_inputs) + + @property + def has_input_keys(self) -> bool: + """Check if the InferenceRequest contains input_keys. 
+ + :returns: True if input_keys is not None and is not an empty list, + False otherwise + """ + return self.input_keys is not None and bool(self.input_keys) + + @property + def has_output_keys(self) -> bool: + """Check if the InferenceRequest contains output_keys. + + :returns: True if output_keys is not None and is not an empty list, + False otherwise + """ + return self.output_keys is not None and bool(self.output_keys) + + @property + def has_input_meta(self) -> bool: + """Check if the InferenceRequest contains input_meta. + + :returns: True if input_meta is not None and is not an empty list, + False otherwise + """ + return self.input_meta is not None and bool(self.input_meta) + class InferenceReply: """Internal representation of the reply to a client request for inference.""" @@ -121,6 +173,24 @@ def __init__( self.message = message """Status message that corresponds with the status enum""" + @property + def has_outputs(self) -> bool: + """Check if the InferenceReply contains outputs. + + :returns: True if outputs is not None and is not an empty list, + False otherwise + """ + return self.outputs is not None and bool(self.outputs) + + @property + def has_output_keys(self) -> bool: + """Check if the InferenceReply contains output_keys. + + :returns: True if output_keys is not None and is not an empty list, + False otherwise + """ + return self.output_keys is not None and bool(self.output_keys) + class LoadModelResult: """A wrapper around a loaded model.""" @@ -372,13 +442,13 @@ def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: information needed in the reply """ prepared_outputs: t.List[t.Any] = [] - if reply.output_keys: + if reply.has_output_keys: for value in reply.output_keys: if not value: continue msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor) prepared_outputs.append(msg_key) - elif reply.outputs: + elif reply.has_outputs: for _ in reply.outputs: msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", @@ -448,7 +518,7 @@ def fetch_inputs( if not feature_stores: raise ValueError("No input and no feature store provided") - if request.input_keys: + if request.has_input_keys: data: t.List[bytes] = [] for fs_key in request.input_keys: diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 618b00d87e..a5df629545 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -460,6 +460,12 @@ def test_dispatcher_pipeline_stage_errors_handled( mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + monkeypatch.setattr( + request_dispatcher, + "_validate_request", + MagicMock(return_value=True), + ) + if stage not in ["fetch_inputs"]: monkeypatch.setattr( integrated_worker, diff --git a/tests/dragon/test_inference_reply.py b/tests/dragon/test_inference_reply.py new file mode 100644 index 0000000000..1eb137ae61 --- /dev/null +++ b/tests/dragon/test_inference_reply.py @@ -0,0 +1,76 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.worker import InferenceReply +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +handler = MessageHandler() + + +@pytest.fixture +def inference_reply() -> InferenceReply: + return InferenceReply() + + +@pytest.fixture +def fs_key() -> FeatureStoreKey: + return FeatureStoreKey("key", "descriptor") + + +@pytest.mark.parametrize( + "outputs, expected", + [ + ([b"output bytes"], True), + (None, False), + ([], False), + ], +) +def test_has_outputs(monkeypatch, inference_reply, outputs, expected): + """Test the has_outputs property with different values for outputs.""" + monkeypatch.setattr(inference_reply, "outputs", outputs) + assert inference_reply.has_outputs == expected + + +@pytest.mark.parametrize( + "output_keys, expected", + [ + ([fs_key], True), + (None, False), + ([], False), + ], +) +def test_has_output_keys(monkeypatch, inference_reply, output_keys, expected): + """Test the has_output_keys property with different values for output_keys.""" + monkeypatch.setattr(inference_reply, "output_keys", output_keys) + assert inference_reply.has_output_keys == expected diff --git a/tests/dragon/test_inference_request.py b/tests/dragon/test_inference_request.py new file mode 100644 index 0000000000..909d021d6e --- /dev/null +++ b/tests/dragon/test_inference_request.py @@ -0,0 +1,118 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +handler = MessageHandler() + + +@pytest.fixture +def inference_request() -> InferenceRequest: + return InferenceRequest() + + +@pytest.fixture +def fs_key() -> FeatureStoreKey: + return FeatureStoreKey("key", "descriptor") + + +@pytest.mark.parametrize( + "raw_model, expected", + [ + (handler.build_model(b"bytes", "Model Name", "V1"), True), + (None, False), + ], +) +def test_has_raw_model(monkeypatch, inference_request, raw_model, expected): + """Test the has_raw_model property with different values for raw_model.""" + monkeypatch.setattr(inference_request, "raw_model", raw_model) + assert inference_request.has_raw_model == expected + + +@pytest.mark.parametrize( + "model_key, expected", + [ + (fs_key, True), + (None, False), + ], +) +def test_has_model_key(monkeypatch, inference_request, model_key, expected): + """Test the has_model_key property with different values for model_key.""" + monkeypatch.setattr(inference_request, "model_key", model_key) + assert inference_request.has_model_key == expected + + +@pytest.mark.parametrize( + "raw_inputs, expected", + [([b"raw input bytes"], True), (None, False), ([], False)], +) +def test_has_raw_inputs(monkeypatch, inference_request, raw_inputs, expected): + """Test the has_raw_inputs property with different values for raw_inputs.""" + monkeypatch.setattr(inference_request, "raw_inputs", raw_inputs) + assert inference_request.has_raw_inputs == expected + + +@pytest.mark.parametrize( + "input_keys, expected", + [([fs_key], True), (None, False), ([], False)], +) +def test_has_input_keys(monkeypatch, inference_request, input_keys, expected): + """Test the has_input_keys property with different values for input_keys.""" + monkeypatch.setattr(inference_request, "input_keys", input_keys) + assert inference_request.has_input_keys == expected + + +@pytest.mark.parametrize( + "output_keys, expected", + [([fs_key], True), (None, False), ([], False)], +) +def test_has_output_keys(monkeypatch, inference_request, output_keys, expected): + """Test the has_output_keys property with different values for output_keys.""" + monkeypatch.setattr(inference_request, "output_keys", output_keys) + assert inference_request.has_output_keys == expected + + +@pytest.mark.parametrize( + "input_meta, expected", + [ + ([handler.build_tensor_descriptor("c", "float32", [1, 2, 3])], True), + (None, False), + ([], False), + ], +) +def test_has_input_meta(monkeypatch, inference_request, input_meta, expected): + """Test the has_input_meta property with different values for input_meta.""" + monkeypatch.setattr(inference_request, "input_meta", input_meta) + assert 
inference_request.has_input_meta == expected From 5ec287c7915625924d1316795e75419bbca77312 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:24:07 -0700 Subject: [PATCH 32/60] Bug fix (#715) Fixed bug in _validate. [ committed by @AlyssaCote ] [ reviewed by @ankona ] --- doc/changelog.md | 1 + .../_core/mli/infrastructure/control/request_dispatcher.py | 2 +- tests/dragon/test_error_handling.py | 6 ------ 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 45f8e4005e..7d08c9376f 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Quick bug fix in _validate - Add helper methods to MLI classes - Update error handling for consistency - Parameterize installation of dragon package with `smart build` diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 1c3b0f4c85..67797fe448 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -280,7 +280,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - if self.has_featurestore_factory: + if not self.has_featurestore_factory: logger.error("No feature store factory configured") return False diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index a5df629545..618b00d87e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -460,12 +460,6 @@ def test_dispatcher_pipeline_stage_errors_handled( mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) - monkeypatch.setattr( - request_dispatcher, - "_validate_request", - MagicMock(return_value=True), - ) - if stage not in ["fetch_inputs"]: monkeypatch.setattr( integrated_worker, From ce024335dbd200f24b23afe506dddf6e31ec1372 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Sep 2024 10:34:15 -0500 Subject: [PATCH 33/60] Committing pre-merge --- .../control/request_dispatcher.py | 24 ++++++++++++++----- .../infrastructure/control/worker_manager.py | 3 +-- .../_core/mli/infrastructure/worker/worker.py | 20 +++++++++++++++- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index b4c34db955..c6b7032091 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -250,7 +250,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -343,8 +343,10 @@ def _on_iteration(self) -> None: """This method is executed repeatedly until ``Service`` shutdown conditions are satisfied and cooldown is elapsed.""" try: + logger.debug("Receiving message") self._perf_timer.is_active = True bytes_list: t.List[bytes] = self._incoming_channel.recv() + 
logger.debug("Received data") except Exception: self._perf_timer.is_active = False else: @@ -359,12 +361,22 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() - request = self._worker.deserialize_message( - request_bytes, self._callback_factory - ) + logger.debug("Deserialzing message") + + try: + request = self._worker.deserialize_message( + request_bytes, self._callback_factory + ) + except Exception as exc: + exception_handler(exc, request.callback, "Error deserializing request") + self._perf_timer.end_timings() + return + logger.debug("Deserialized message") if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list + logger.debug("Assigned data") + self._perf_timer.measure_time("deserialize_message") if not self._validate_request(request): @@ -384,8 +396,8 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801 and self._perf_timer.is_active: - self._perf_timer.print_timings(True) + # if self._perf_timer.max_length == 801 and self._perf_timer.is_active: + # self._perf_timer.print_timings(True) def remove_queues(self) -> None: """Remove references to queues that can be removed diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index 198ee657bb..8e40bd9d75 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -106,7 +106,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" self._processed_batches: int = 0 """Number of processed request batches""" @@ -178,7 +178,6 @@ def _on_iteration(self) -> None: ) return - self._processed_batches += 1 # print(f"**** PROCESSING BATCH {self._processed_batches} ****", flush=True) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index a91e8bf878..fa224a51f2 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -125,12 +125,22 @@ def __init__( class LoadModelResult: """A wrapper around a loaded model.""" - def __init__(self, model: t.Any) -> None: + def __init__( + self, + model: t.Any, + inputs: t.Optional[t.List[str]] = None, + outputs: t.Optional[t.List[str]] = None, + ) -> None: """Initialize the LoadModelResult. 
:param model: The loaded model """ self.model = model + """The model as bytes""" + self.inputs = inputs + """List of input layer names, only used in TensorFlow""" + self.outputs = outputs + """List of output tensor names, only used in TensorFlow""" class TransformInputResult: @@ -321,6 +331,7 @@ def deserialize_message( model_key: t.Optional[FeatureStoreKey] = None model_bytes: t.Optional[Model] = None + logger.debug("Getting key") if request.model.which() == "key": model_key = FeatureStoreKey( key=request.model.key.key, @@ -329,27 +340,34 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data + logger.debug("Getting descriptor") callback_key = request.replyChannel.descriptor + logger.debug(f"Callback factory {callback_factory}({callback_key})") comm_channel = callback_factory(callback_key) input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None output_keys: t.Optional[t.List[FeatureStoreKey]] = None input_meta: t.Optional[t.List[TensorDescriptor]] = None + logger.debug("Does it have a key?") if request.input.which() == "keys": + logger.debug("Yeah") input_keys = [ FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": + logger.debug("Not, but it has descriptors") input_meta = request.input.descriptors # type: ignore + logger.debug("Does it have an output?") if request.output: output_keys = [ FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.output ] + logger.debug("Going to build the request, then") inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, From c1c4604e46ba2135cabe889600ac1959845249d6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Sep 2024 16:48:37 -0500 Subject: [PATCH 34/60] Working after merge --- ex/high_throughput_inference/mli_driver.py | 29 +- ex/high_throughput_inference/mock_app.py | 66 ++-- .../mock_app_redis.py | 42 ++- .../mock_app_tensorflow.py | 206 ++++++++++++ ex/high_throughput_inference/redis_driver.py | 9 +- .../standalone_worker_manager.py | 37 +-- smartsim/_core/entrypoints/service.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 2 +- .../_core/mli/comm/channel/dragon_channel.py | 5 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 2 +- .../infrastructure/control/device_manager.py | 10 +- .../control/request_dispatcher.py | 5 +- .../infrastructure/control/worker_manager.py | 4 +- .../worker/tensorflow_worker.py | 308 ++++++++++++++++++ .../_core/mli/infrastructure/worker/worker.py | 8 - 15 files changed, 646 insertions(+), 90 deletions(-) create mode 100644 ex/high_throughput_inference/mock_app_tensorflow.py create mode 100644 smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c385ce4a94..fa56f6ca77 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,23 +10,29 @@ from smartsim import Experiment from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker from smartsim.settings import DragonRunSettings from smartsim.status import TERMINAL_STATUSES parser = argparse.ArgumentParser("Mock application") parser.add_argument("--log_max_batchsize", default=8, type=int) parser.add_argument("--num_nodes_app", default=1, 
type=int) +parser.add_argument("--toolkit", default="torch", choices=["torch","tensorflow"], type=str) args = parser.parse_args() DEVICE = "gpu" -NUM_RANKS_PER_NODE = 16 +NUM_RANKS_PER_NODE = 1 NUM_NODES_APP = args.num_nodes_app NUM_WORKERS = 1 BATCH_SIZE = 2 BATCH_TIMEOUT = 0.0 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_worker_manager.py") -app_script_name = os.path.join(filedir, "mock_app.py") +if args.toolkit == "torch": + # keeping old name for backward compatibility + app_script_name = os.path.join(filedir, "mock_app.py") +else: + app_script_name = os.path.join(filedir, f"mock_app_{args.toolkit}.py") model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" @@ -36,6 +42,7 @@ exp_path = os.path.join( filedir, "benchmark", + args.toolkit, f"throughput_n{NUM_NODES_APP}_rpn{NUM_RANKS_PER_NODE}_timeout{BATCH_TIMEOUT}", f"samples{2**args.log_max_batchsize}", ) @@ -47,7 +54,11 @@ os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_benchmark", launcher="dragon", exp_path=exp_path) -torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") +if args.toolkit == "torch": + worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") +elif args.toolkit == "tensorflow": + worker_str = base64.b64encode(cloudpickle.dumps(TensorFlowWorker)).decode("ascii") + worker_manager_rs: DragonRunSettings = exp.create_run_settings( sys.executable, @@ -56,7 +67,7 @@ "--device", DEVICE, "--worker_class", - torch_worker_str, + worker_str, "--batch_size", str(BATCH_SIZE), "--batch_timeout", @@ -70,13 +81,19 @@ worker_manager_rs.set_cpu_affinity(aff) worker_manager_rs.set_gpu_affinity([0, 1, 2, 3]) -worker_manager_rs.set_hostlist(["pinoak0037"]) +worker_manager_rs.set_hostlist(["pinoak0043"]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(args.log_max_batchsize)], + exe_args=[ + app_script_name, + "--device", + DEVICE, + "--log_max_batchsize", + str(args.log_max_batchsize), + ], ) app_rs.set_tasks_per_node(NUM_RANKS_PER_NODE) app_rs.set_nodes(NUM_NODES_APP) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aa7cfcce80..d6977e5a7b 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -37,21 +37,22 @@ import argparse import io -import numpy import os import time -import torch import typing as t - import warnings +import numpy +import torch from mpi4py import MPI + from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel, create_local from smartsim._core.mli.message_handler import MessageHandler -from smartsim.log import get_logger from smartsim._core.utils.timings import PerfTimer +from smartsim.log import get_logger torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -60,6 +61,7 @@ warnings.filterwarnings("ignore", "\*The given NumPy array is not writable\*") + class ProtoClient: def __init__(self, timing_on: bool): self._comm = MPI.COMM_WORLD @@ -75,11 +77,13 @@ def __init__(self, timing_on: bool): self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) except KeyError: 
time.sleep(1) - self._from_worker_ch = Channel.make_process_local() - self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._from_worker_ch = DragonCommChannel(create_local()) + self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string self._to_worker_ch = Channel.make_process_local() - self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{self._rank}_") + self.perf_timer: PerfTimer = PerfTimer( + debug=True, timing_on=timing_on, prefix=f"a{self._rank}_" + ) self._num_its: int = 0 def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor): @@ -104,28 +108,33 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor): self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: to_sendh.send_bytes(request_bytes) self.perf_timer.measure_time("send_request") for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! + to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! # logger.info(f"{self._rank} sent tensors") self.perf_timer.measure_time("send_tensors") - with self._from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? - data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_tensor") - result = torch.from_numpy( - numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) + + resp = self._from_worker_ch.recv(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp[0]) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(response.result.descriptors)? 
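    # Note on the branch below: DragonCommChannel.recv() returns a list of message
    # parts, so the serialized response and the raw tensor bytes may arrive together
    # (len(resp) > 1) or the tensor blob may need a second recv(); both cases are handled.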
+ if len(resp) > 1: + data_blob = resp[1] + else: + data_blob: bytes = self._from_worker_ch.recv(timeout=None)[0] + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), ) - self.perf_timer.measure_time("deserialize_tensor") + ) + self.perf_timer.measure_time("deserialize_tensor") self.perf_timer.end_timings() self._num_its += 1 @@ -161,10 +170,12 @@ def model(self): def name(self): return self._name + def log(msg: str, rank: int) -> None: if rank == 0: logger.info(msg) + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -182,13 +193,16 @@ def log(msg: str, rank: int) -> None: TOTAL_ITERATIONS = 10 - for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize+1): + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize log(f"Batch size: {b_size}", client._rank) for iteration_number in range(TOTAL_ITERATIONS): # log(f"Iteration: {iteration_number}", client._rank) sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) - log(f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", client._rank) + log( + f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", + client._rank, + ) - client.perf_timer.print_timings(to_file=True, to_stdout=client._rank==0) \ No newline at end of file + client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index 9cdb336be2..d2a3d4a05a 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -26,17 +26,20 @@ import argparse import io -import numpy import time + +import numpy import torch from mpi4py import MPI -from smartsim.log import get_logger -from smartsim._core.utils.timings import PerfTimer from smartredis import Client +from smartsim._core.utils.timings import PerfTimer +from smartsim.log import get_logger + logger = get_logger("App") -class ResNetWrapper(): + +class ResNetWrapper: def __init__(self, name: str, model: str): self._model = None self._name = name @@ -44,7 +47,7 @@ def __init__(self, name: str, model: str): buffer = io.BytesIO(model_file.read()) self._serialized_model = buffer.getvalue() - def get_batch(self, batch_size: int=32): + def get_batch(self, batch_size: int = 32): return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property @@ -60,6 +63,7 @@ def log(msg: str, rank: int) -> None: if rank == 0: logger.info(msg) + if __name__ == "__main__": comm = MPI.COMM_WORLD @@ -75,30 +79,40 @@ def log(msg: str, rank: int) -> None: client = Client(cluster=False, address=None) if rank == 0: - client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + client.set_model( + resnet.name, resnet.model, backend="TORCH", device=args.device.upper() + ) comm.Barrier() - perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=True, prefix=f"redis{rank}_") + perf_timer: PerfTimer = PerfTimer( + debug=False, timing_on=True, prefix=f"redis{rank}_" + ) total_iterations = 100 - timings=[] - for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize+1): + timings = [] + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): batch_size: int = 2**log2_bsize log(f"Batch size: 
{batch_size}", rank) for iteration_number in range(total_iterations): perf_timer.start_timings("batch_size", batch_size) input_name = f"batch_{rank}" output_name = f"result_{rank}" - client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + client.put_tensor( + name=input_name, data=resnet.get_batch(batch_size).numpy() + ) perf_timer.measure_time("send_request") - client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + client.run_model( + name=resnet.name, inputs=[input_name], outputs=[output_name] + ) perf_timer.measure_time("run_model") result = client.get_tensor(name=output_name) perf_timer.measure_time("receive_response") perf_timer.end_timings() comm.Barrier() - log(f"Completed iteration: {iteration_number} in {perf_timer.get_last('total_time')} seconds", rank) - + log( + f"Completed iteration: {iteration_number} in {perf_timer.get_last('total_time')} seconds", + rank, + ) - perf_timer.print_timings(True, to_stdout=rank==0) + perf_timer.print_timings(True, to_stdout=rank == 0) diff --git a/ex/high_throughput_inference/mock_app_tensorflow.py b/ex/high_throughput_inference/mock_app_tensorflow.py new file mode 100644 index 0000000000..557f3c7559 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_tensorflow.py @@ -0,0 +1,206 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
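The TensorFlow example that follows serializes a Keras ResNet50 into a frozen GraphDef rather than a scripted TorchScript module, which is why LoadModelResult in worker.py gained optional input/output name lists earlier in this patch series: a frozen graph has to be driven through named tensors. A minimal, self-contained sketch of that general approach (this is not the TensorFlowWorker implementation, and the tensor names are placeholders):

    import tensorflow as tf

    def wrap_frozen_graph(graph_def_bytes: bytes, input_name: str, output_name: str):
        # Import the frozen GraphDef and expose it as a callable function.
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(graph_def_bytes)
        wrapped = tf.compat.v1.wrap_function(
            lambda: tf.compat.v1.import_graph_def(graph_def, name=""), []
        )
        return wrapped.prune(
            feeds=wrapped.graph.get_tensor_by_name(input_name),
            fetches=wrapped.graph.get_tensor_by_name(output_name),
        )

    # Placeholder tensor names; the real ones depend on how the model was exported.
    # infer = wrap_frozen_graph(serialized_model, "x:0", "Identity:0")
    # outputs = infer(tf.constant(batch))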
+ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import os +import time +import typing as t +import warnings + +import numpy +import tensorflow as tf +from mpi4py import MPI +from tensorflow.python.framework.convert_to_constants import ( + convert_variables_to_constants_v2_as_graph, +) + +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel, create_local +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils.timings import PerfTimer +from smartsim.log import get_logger + +logger = get_logger("App") + + +class ProtoClient: + def __init__(self, timing_on: bool): + self._comm = MPI.COMM_WORLD + self._rank = self._comm.Get_rank() + connect_to_infrastructure() + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) + except KeyError: + time.sleep(1) + self._from_worker_ch = DragonCommChannel(create_local()) + self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string + self._to_worker_ch = Channel.make_process_local() + + self.perf_timer: PerfTimer = PerfTimer( + debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" + ) + self._num_its: int = 0 + + def run_model(self, model: t.Union[bytes, str], batch: numpy.typing.ArrayLike): + tensors = [batch] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch_serialized, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) + self.perf_timer.measure_time("send_tensors") + resp = self._from_worker_ch.recv(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp[0]) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(response.result.descriptors)? 
+ if len(resp) > 1: + data_blob = resp[1] + else: + data_blob: bytes = self._from_worker_ch.recv(timeout=None)[0] + self.perf_timer.measure_time("receive_tensor") + result = numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + self._num_its += 1 + self._comm.Barrier() + return result + + def set_model(self, key: str, model: bytes): + self._ddict[key] = model + + +class ResNetWrapper: + def __init__( + self, + name: str, + model: tf.keras.Model, + ): + self._get_tf_model(model) + self._name = name + + def _get_tf_model(self, model: tf.keras.Model): + real_model = tf.function(model).get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) + ) + _, graph_def = convert_variables_to_constants_v2_as_graph(real_model) + self._serialized_model = graph_def.SerializeToString() + + def get_batch(self, batch_size: int = 32): + return numpy.random.randn(batch_size, 224, 224, 3).astype(numpy.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + + +def log(msg: str, rank: int) -> None: + if rank == 0: + logger.info(msg) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu", type=str) + parser.add_argument("--log_max_batchsize", default=8, type=int) + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", tf.keras.applications.ResNet50()) + + client = ProtoClient(timing_on=True) + + if client._rank == 0: + client.set_model(resnet.name, resnet.model) + + MPI.COMM_WORLD.Barrier() + + TOTAL_ITERATIONS = 10 + + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): + b_size: int = 2**log2_bsize + log(f"Batch size: {b_size}", client._rank) + for iteration_number in range(TOTAL_ITERATIONS): + sample_batch = resnet.get_batch(b_size) + log(f"Batch size {sample_batch.shape}", client._rank) + remote_result = client.run_model(resnet.name, sample_batch) + log( + f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", + client._rank, + ) + + client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index 0774b88f98..b3b2424723 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -64,7 +64,14 @@ db = exp.create_database(interface="hsn0", hosts=["pinoak0036"]) app_rs = exp.create_run_settings( - sys.executable, exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(args.log_max_batchsize)] + sys.executable, + exe_args=[ + app_script_name, + "--device", + DEVICE, + "--log_max_batchsize", + str(args.log_max_batchsize), + ], ) app_rs.set_nodes(NUM_NODES) app_rs.set_tasks(NUM_NODES * NUM_TASKS_PER_NODE) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 92697282df..f92f199d2e 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -37,6 +37,7 @@ from dragon.globalservices.api_setup import connect_to_infrastructure from dragon.managed_memory import MemoryPool from dragon.utils import b64decode, b64encode + # pylint enable=import-error # isort: off @@ -45,6 +46,7 @@ import argparse 
import base64 import multiprocessing as mp +import optparse import os import pickle import socket @@ -53,16 +55,11 @@ import typing as t import cloudpickle -import optparse -import os from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -72,7 +69,6 @@ DragonFeatureStore, ) from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase - from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -85,7 +81,6 @@ logger.info(f"CPUS: {os.cpu_count()}") - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] ) -> dragon_process.Process: @@ -108,8 +103,6 @@ def service_as_dragon_proc( ) - - if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -163,7 +156,7 @@ def service_as_dragon_proc( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) @@ -172,14 +165,14 @@ def service_as_dragon_proc( batch_size=args.batch_size, config_loader=config_loader, worker_type=arg_worker_type, - mem_pool_size=128*1024**3, + mem_pool_size=128 * 1024**3, ) wms = [] worker_device = args.device for wm_idx in range(args.num_workers): - worker_manager = WorkerManager( + worker_manager = WorkerManager( config_loader=config_loader, worker_type=arg_worker_type, as_service=True, @@ -197,21 +190,25 @@ def service_as_dragon_proc( # the GPU-to-CPU mapping is taken from the nvidia-smi tool # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} - gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) - gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) - gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) - gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + gpu_to_cpu_aff[0] = list(range(48, 64)) + list(range(112, 128)) + gpu_to_cpu_aff[1] = list(range(32, 48)) + list(range(96, 112)) + gpu_to_cpu_aff[2] = list(range(16, 32)) + list(range(80, 96)) + gpu_to_cpu_aff[3] = list(range(0, 16)) + list(range(64, 80)) worker_manager_procs = [] for worker_idx in range(args.num_workers): wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) - worker_manager_procs.append(service_as_dragon_proc( + worker_manager_procs.append( + service_as_dragon_proc( worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] - )) + ) + ) - dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + dispatcher_proc = service_as_dragon_proc( + dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[] + ) # TODO: use ProcessGroup and restart=True? 
all_procs = [dispatcher_proc, *worker_manager_procs] @@ -223,4 +220,4 @@ def service_as_dragon_proc( time.sleep(1) for proc in all_procs: - logger.info(f"{proc} is alive: {proc.is_alive}") \ No newline at end of file + logger.info(f"{proc} is alive: {proc.is_alive}") diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b67..f043a4a230 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -90,7 +90,8 @@ def _log_cooldown(self, elapsed: float) -> None: """Log the remaining cooldown time, if any""" remaining = self._cooldown - elapsed if remaining > 0: - logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") + pass + # logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") else: logger.info(f"exceeded cooldown {self._cooldown}s by {abs(remaining):.2f}s") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index e44e305db4..b199df4e3e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -951,4 +951,4 @@ def _host_table_line(host: str) -> list[str]: return tabulate( values, headers, disable_numparse=True, tablefmt="github", colalign=colalign - ) \ No newline at end of file + ) diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 1363c0d675..f6893220eb 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -81,6 +81,7 @@ def create_local(capacity: int = 0) -> dch.Channel: logger.error(f"All attempts to attach local channel have failed") raise SmartSimError("Failed to attach local channel") from e + logger.debug(f"Attached to channel id {cid}") return channel @@ -131,7 +132,7 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: messages: t.List[bytes] = [] try: - message_bytes = recvh.recv_bytes(timeout=timeout) + message_bytes = recvh.recv_bytes(timeout=None) messages.append(message_bytes) logger.debug(f"DragonCommChannel {self.descriptor!r} received message") except dch.ChannelEmpty: @@ -178,11 +179,11 @@ def from_descriptor( # todo: ensure the bytes argument and condition are removed # after refactoring the RPC models - actual_descriptor = base64.b64decode(utf8_descriptor) channel = dch.Channel.attach(actual_descriptor) return DragonCommChannel(channel) except Exception as ex: + logger.debug(f"Failed to create dragon comm channel: {descriptor!r}, {ex}") raise SmartSimError( f"Failed to create dragon comm channel: {descriptor!r}" ) from ex diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 84d809c8ac..5f82b82abe 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -94,7 +94,7 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: with self._fli.recvh(timeout=timeout) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=timeout) + message, _ = recvh.recv_bytes(timeout=None) messages.append(message) logger.debug( f"DragonFLIChannel {self.descriptor!r} received message" diff --git a/smartsim/_core/mli/infrastructure/control/device_manager.py b/smartsim/_core/mli/infrastructure/control/device_manager.py index 9334971f8c..8e6ec28743 100644 --- a/smartsim/_core/mli/infrastructure/control/device_manager.py +++ 
b/smartsim/_core/mli/infrastructure/control/device_manager.py @@ -29,7 +29,7 @@ from .....log import get_logger from ..storage.feature_store import FeatureStore -from ..worker.worker import MachineLearningWorkerBase, RequestBatch +from ..worker.worker import LoadModelResult, MachineLearningWorkerBase, RequestBatch logger = get_logger(__name__) @@ -42,7 +42,7 @@ def __init__(self, name: str) -> None: """ self._name = name """The name used by the toolkit to identify this device""" - self._models: dict[str, t.Any] = {} + self._models: dict[str, LoadModelResult] = {} """Dict of keys to models which are loaded on this device""" @property @@ -53,7 +53,7 @@ def name(self) -> str: """ return self._name - def add_model(self, key: str, model: t.Any) -> None: + def add_model(self, key: str, model: LoadModelResult) -> None: """Add a reference to a model loaded on this device and assign it a key. :param key: The key under which the model is saved @@ -73,7 +73,7 @@ def remove_model(self, key: str) -> None: logger.warning(f"An unknown key was requested for removal: {key}") raise - def get_model(self, key: str) -> t.Any: + def get_model(self, key: str) -> LoadModelResult: """Get the model corresponding to a given key. :param key: The model key @@ -136,7 +136,7 @@ def _load_model_on_device( model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) - self._device.add_model(batch.model_id.key, loaded_model.model) + self._device.add_model(batch.model_id.key, loaded_model) def get_device( self, diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 78f56b1ff3..cdf651bfef 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -145,7 +145,7 @@ def ready(self) -> bool: return False timed_out = ( - self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + self._batch_timeout >= 0 and self._elapsed_time >= self._batch_timeout ) logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") return self.full() or timed_out @@ -250,7 +250,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" @property @@ -351,7 +351,6 @@ def _on_iteration(self) -> None: """This method is executed repeatedly until ``Service`` shutdown conditions are satisfied and cooldown is elapsed.""" try: - logger.debug("Receiving message") self._perf_timer.is_active = True bytes_list: t.List[bytes] = self._incoming_channel.recv() logger.debug("Received data") diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index f2cc35caf8..c2d9b53fba 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -106,7 +106,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = 
PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" self._processed_batches: int = 0 """Number of processed request batches""" @@ -238,7 +238,7 @@ def _on_iteration(self) -> None: with device_cm as device: try: - model_result = LoadModelResult(device.get_model(batch.model_id.key)) + model_result = device.get_model(batch.model_id.key) except Exception as exc: for request in batch.requests: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py new file mode 100644 index 0000000000..09da58afc3 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -0,0 +1,308 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io + +import logging +import numpy as np +import numpy.typing as npt +import os +import tensorflow as tf + +# pylint: disable=import-error +from dragon.managed_memory import MemoryAlloc, MemoryPool +from tensorflow.python.framework.convert_to_constants import ( + convert_var_to_const_function_in_v1, +) +from tensorflow.python.framework.ops import disable_eager_execution + +tf.get_logger().setLevel(logging.ERROR) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) + +# pylint: enable=import-error + +logger = get_logger(__name__) + +disable_eager_execution() + + +class TensorFlowWorker(MachineLearningWorkerBase): + """A worker that executes a TensorFlow model.""" + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + """Given a loaded MachineLearningModel, ensure it is loaded into + device memory. 
+ + :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed + :returns: LoadModelResult wrapping the model loaded for the request + :raises ValueError: If model reference object is not found + :raises RuntimeError: If loading and evaluating the model failed + """ + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data + else: + raise ValueError("Unable to load model without reference object") + + try: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(model_bytes) + + with tf.Graph().as_default() as graph, tf.device(device): + tf.import_graph_def(graph_def, name="") + + ops = graph.get_operations() + input_layers = [] + for op in ops: + if op.type == "Placeholder": + logger.debug( + "Input op name: {}, output shape : {}".format( + op.name, op.outputs[0].get_shape() + ) + ) + input_layers.append(f"{op.name}:0") + + output_tensors = set() + input_tensors = set() + for op in ops: + for x in op.inputs: + if x.name not in input_tensors: + input_tensors.add(x.name) + for op in ops: + if len(op.outputs) > 0: + x = op.outputs[0] + if x.name not in input_tensors: + logger.debug( + "Output tensor name: {}, tensor shape : {}, parent op type: {}".format( + x.name, x.get_shape(), op.type + ) + ) + output_tensors.add(x.name) + + except Exception as e: + raise RuntimeError( + "Failed to load and evaluate the model: " + f"Model key {batch.model_id.key}, Device {device}" + ) from e + result = LoadModelResult(graph, input_layers, list(output_tensors)) + return result + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, + ) -> TransformInputResult: + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. 
+ + :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors + :returns: The transformed inputs wrapped in a TransformInputResult + :raises ValueError: If tensors cannot be reconstructed + :raises IndexError: If index out of range + """ + results: list[memoryview] = [] + total_samples = 0 + slices: list[slice] = [] + + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta + ): + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results) - 1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + try: + joined = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + mem_view[:alloc_size] = joined + except IndexError as e: + raise IndexError( + "Error accessing elements in fetch_result.inputs " + f"with index {result_tensor_idx}" + ) from e + + results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims, all_dtypes) + + # pylint: disable-next=unused-argument + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + """Execute an ML model on inputs transformed for use by the model. 
+ + :param batch: The batch of requests that triggered the pipeline + :param load_result: The result of loading the model onto device memory + :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed + :returns: The result of inference wrapped in an ExecuteResult + :raises SmartSimError: If model is not loaded + :raises IndexError: If memory slicing is out of range + :raises ValueError: If tensor creation fails or is unable to evaluate the model + """ + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + device_to_tf = {"cpu": "/CPU", "gpu": "/GPU"} + for old, new in device_to_tf.items(): + device = device.replace(old, new) + + tensors = [] + mem_allocs = [] + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + try: + tensors.append( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + except IndexError as e: + raise IndexError("Error during memory slicing") from e + except Exception as e: + raise ValueError("Error during tensor creation") from e + + model_graph: tf.Graph = load_result.model + try: + with tf.compat.v1.Session(graph=model_graph) as sess, tf.device(device): + results = sess.run( + load_result.outputs, + feed_dict={ + input_layer: tensor + for input_layer, tensor in zip(load_result.inputs, tensors) + }, + ) + except Exception as e: + raise ValueError( + f"Error while evaluating the model: Model {batch.model_id.key}" + ) from e + + transform_result.transformed = [] + + execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() + return execute_result + + @staticmethod + def transform_output( + batch: RequestBatch, + execute_result: ExecuteResult, + ) -> list[TransformOutputResult]: + """Given inference results, perform transformations required to + transmit results to the requestor. + + :param batch: The batch of requests that triggered the pipeline + :param execute_result: The result of inference wrapped in an ExecuteResult + :returns: A list of transformed outputs + :raises IndexError: If indexing is out of range + :raises ValueError: If transforming output fails + """ + transformed_list: list[TransformOutputResult] = [] + cpu_predictions = execute_result.predictions + + for result_slice in execute_result.slices: + transformed = [] + for cpu_item in cpu_predictions: + try: + transformed.append(cpu_item[result_slice].tobytes()) + + # todo: need the shape from latest schemas added here. 
+ transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme + except IndexError as e: + raise IndexError( + f"Error accessing elements: result_slice {result_slice}" + ) from e + except Exception as e: + raise ValueError("Error transforming output") from e + + execute_result.predictions = [] + + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 8bcf5c5f58..309cf8675d 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -401,7 +401,6 @@ def deserialize_message( model_key: t.Optional[FeatureStoreKey] = None model_bytes: t.Optional[Model] = None - logger.debug("Getting key") if request.model.which() == "key": model_key = FeatureStoreKey( key=request.model.key.key, @@ -410,34 +409,27 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - logger.debug("Getting descriptor") callback_key = request.replyChannel.descriptor - logger.debug(f"Callback factory {callback_factory}({callback_key})") comm_channel = callback_factory(callback_key) input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None output_keys: t.Optional[t.List[FeatureStoreKey]] = None input_meta: t.Optional[t.List[TensorDescriptor]] = None - logger.debug("Does it have a key?") if request.input.which() == "keys": - logger.debug("Yeah") input_keys = [ FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": - logger.debug("Not, but it has descriptors") input_meta = request.input.descriptors # type: ignore - logger.debug("Does it have an output?") if request.output: output_keys = [ FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.output ] - logger.debug("Going to build the request, then") inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, From 3a5a8ce1b63bc5304f6a41039aa9209e556531ee Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Sep 2024 18:59:08 -0500 Subject: [PATCH 35/60] Improve TF Worker throughput --- .../_core/mli/infrastructure/worker/tensorflow_worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index 09da58afc3..e89019a3c1 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -127,7 +127,8 @@ def load_model( "Failed to load and evaluate the model: " f"Model key {batch.model_id.key}, Device {device}" ) from e - result = LoadModelResult(graph, input_layers, list(output_tensors)) + with tf.device(device): + result = LoadModelResult(tf.compat.v1.Session(graph=graph), input_layers, list(output_tensors)) return result @staticmethod @@ -247,9 +248,9 @@ def execute( except Exception as e: raise ValueError("Error during tensor creation") from e - model_graph: tf.Graph = load_result.model + sess = load_result.model try: - with tf.compat.v1.Session(graph=model_graph) as sess, tf.device(device): + with tf.device(device): results = sess.run( load_result.outputs, feed_dict={ From 48caac247c135c2e8dc17ecf8d933a2533b9b9fb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Sep 2024 19:03:04 -0500 Subject: [PATCH 36/60] Revert 
changes to throughput scripts --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app_tensorflow.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index fa56f6ca77..c66bda30a6 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -21,7 +21,7 @@ args = parser.parse_args() DEVICE = "gpu" -NUM_RANKS_PER_NODE = 1 +NUM_RANKS_PER_NODE = 16 NUM_NODES_APP = args.num_nodes_app NUM_WORKERS = 1 BATCH_SIZE = 2 diff --git a/ex/high_throughput_inference/mock_app_tensorflow.py b/ex/high_throughput_inference/mock_app_tensorflow.py index 557f3c7559..d70991980f 100644 --- a/ex/high_throughput_inference/mock_app_tensorflow.py +++ b/ex/high_throughput_inference/mock_app_tensorflow.py @@ -196,7 +196,6 @@ def log(msg: str, rank: int) -> None: log(f"Batch size: {b_size}", client._rank) for iteration_number in range(TOTAL_ITERATIONS): sample_batch = resnet.get_batch(b_size) - log(f"Batch size {sample_batch.shape}", client._rank) remote_result = client.run_model(resnet.name, sample_batch) log( f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", From f79f53ced4d38696aaa984de8ef9019da5e1db86 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Sep 2024 19:32:19 -0500 Subject: [PATCH 37/60] Style, lint, changelog --- doc/changelog.md | 9 +- smartsim/_core/_cli/validate.py | 1 + smartsim/_core/entrypoints/service.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 3 +- .../_core/mli/comm/channel/dragon_channel.py | 1 - .../control/request_dispatcher.py | 14 +-- .../infrastructure/control/worker_manager.py | 8 +- .../worker/tensorflow_worker.py | 90 ++++++++++--------- 8 files changed, 64 insertions(+), 65 deletions(-) diff --git a/doc/changelog.md b/doc/changelog.md index 7d08c9376f..48b773e30a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add `TensorFlowWorker` - Quick bug fix in _validate - Add helper methods to MLI classes - Update error handling for consistency @@ -46,7 +47,7 @@ To be released at some future point in time Description -- Allow specifying Model and Ensemble parameters with +- Allow specifying Model and Ensemble parameters with number-like types (e.g. numpy types) - Pin watchdog to 4.x - Update codecov to 4.5.0 @@ -64,9 +65,9 @@ Description Detailed Notes -- The serializer would fail if a parameter for a Model or Ensemble - was specified as a numpy dtype. The constructors for these - methods now validate that the input is number-like and convert +- The serializer would fail if a parameter for a Model or Ensemble + was specified as a numpy dtype. 
The constructors for these + methods now validate that the input is number-like and convert them to strings ([SmartSim-PR676](https://github.com/CrayLabs/SmartSim/pull/676)) - Pin watchdog to 4.x because v5 introduces new types and requires diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 6d7c72f172..17cf342978 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -241,6 +241,7 @@ def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: + # pylint: disable-next=no-name-in-module from tensorflow import keras from smartsim.ml.tf import freeze_model diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index f043a4a230..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -90,8 +90,7 @@ def _log_cooldown(self, elapsed: float) -> None: """Log the remaining cooldown time, if any""" remaining = self._cooldown - elapsed if remaining > 0: - pass - # logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") + logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") else: logger.info(f"exceeded cooldown {self._cooldown}s by {abs(remaining):.2f}s") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index b199df4e3e..31f1aaea93 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -865,7 +865,8 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: def __del__(self) -> None: try: - self._ddict.destroy() + if self._infra_ddict is not None: + self._infra_ddict.destroy() except Exception: logger.error("Could not destroy Backbone dictionary") diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index f6893220eb..d9254889e3 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -81,7 +81,6 @@ def create_local(capacity: int = 0) -> dch.Channel: logger.error(f"All attempts to attach local channel have failed") raise SmartSimError("Failed to attach local channel") from e - logger.debug(f"Attached to channel id {cid}") return channel diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index cdf651bfef..c61ab8a225 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -368,22 +368,24 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() - logger.debug("Deserialzing message") - + request = None try: request = self._worker.deserialize_message( request_bytes, self._callback_factory ) except Exception as exc: - exception_handler(exc, request.callback, "Error deserializing request") + exception_handler(exc, None, "Error deserializing request") + self._perf_timer.end_timings() + return + + if request is None: + exception_handler(exc, None, "Error deserializing request") self._perf_timer.end_timings() return - logger.debug("Deserialized message") + if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list - logger.debug("Assigned data") - self._perf_timer.measure_time("deserialize_message") if not 
self._validate_request(request): diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index c2d9b53fba..eef739ca05 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -45,12 +45,7 @@ from ....utils.timings import PerfTimer from ...message_handler import MessageHandler from ..environment_loader import EnvironmentConfigLoader -from ..worker.worker import ( - InferenceReply, - LoadModelResult, - MachineLearningWorkerBase, - RequestBatch, -) +from ..worker.worker import InferenceReply, MachineLearningWorkerBase, RequestBatch from .device_manager import DeviceManager, WorkerDevice from .error_handling import build_failure_reply, exception_handler @@ -326,7 +321,6 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("send") self._perf_timer.end_timings() - # print(f"**** PROCESSED {self._processed_batches} BATCHES AND {self._sent_responses} REPLIES ****", flush=True) # if self._perf_timer.max_length == 1600: # self._perf_timer.print_timings(True) diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index e89019a3c1..b9692e153c 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -24,24 +24,20 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import io - import logging -import numpy as np -import numpy.typing as npt import os + +import numpy as np import tensorflow as tf -# pylint: disable=import-error -from dragon.managed_memory import MemoryAlloc, MemoryPool -from tensorflow.python.framework.convert_to_constants import ( - convert_var_to_const_function_in_v1, -) +# pylint: disable-next=no-name-in-module from tensorflow.python.framework.ops import disable_eager_execution -tf.get_logger().setLevel(logging.ERROR) +# isort: off +# isort: on -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +# pylint: disable=import-error +from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import SmartSimError from .....log import get_logger @@ -59,6 +55,9 @@ # pylint: enable=import-error + +tf.get_logger().setLevel(logging.ERROR) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" logger = get_logger(__name__) disable_eager_execution() @@ -91,44 +90,48 @@ def load_model( graph_def = tf.compat.v1.GraphDef() graph_def.ParseFromString(model_bytes) + # pylint: disable-next=not-context-manager with tf.Graph().as_default() as graph, tf.device(device): tf.import_graph_def(graph_def, name="") - ops = graph.get_operations() - input_layers = [] - for op in ops: - if op.type == "Placeholder": - logger.debug( - "Input op name: {}, output shape : {}".format( - op.name, op.outputs[0].get_shape() - ) - ) - input_layers.append(f"{op.name}:0") - - output_tensors = set() - input_tensors = set() - for op in ops: - for x in op.inputs: - if x.name not in input_tensors: - input_tensors.add(x.name) - for op in ops: - if len(op.outputs) > 0: - x = op.outputs[0] - if x.name not in input_tensors: - logger.debug( - "Output tensor name: {}, tensor shape : {}, parent op type: {}".format( - x.name, x.get_shape(), op.type - ) - ) - output_tensors.add(x.name) - except Exception as e: raise RuntimeError( "Failed to load and evaluate the model: " f"Model key {batch.model_id.key}, 
Device {device}" ) from e + + input_layers = set() + for operation in ops: + if operation.type == "Placeholder": + logger.debug( + f"Input op name: {operation.name}, " + f"output shape : {operation.outputs[0].get_shape()}" + ) + input_layers.add(f"{operation.name}:0") + + output_tensors = set() + input_tensors = set() + for operation in ops: + for x in operation.inputs: + if x.name not in input_tensors: + input_tensors.add(x.name) + for operation in ops: + if len(operation.outputs) > 0: + x = operation.outputs[0] + if x.name not in input_tensors: + logger.debug( + f"Output tensor name: {x.name}, " + f"tensor shape : {x.get_shape()}, " + f"parent op type: {operation.type}" + ) + output_tensors.add(x.name) + with tf.device(device): - result = LoadModelResult(tf.compat.v1.Session(graph=graph), input_layers, list(output_tensors)) + result = LoadModelResult( + tf.compat.v1.Session(graph=graph), + list(input_layers), + list(output_tensors), + ) return result @staticmethod @@ -249,14 +252,13 @@ def execute( raise ValueError("Error during tensor creation") from e sess = load_result.model + if load_result.inputs is None: + raise ValueError("Model was stored without inputs") try: with tf.device(device): results = sess.run( load_result.outputs, - feed_dict={ - input_layer: tensor - for input_layer, tensor in zip(load_result.inputs, tensors) - }, + feed_dict=dict(zip(load_result.inputs, tensors)), ) except Exception as e: raise ValueError( From f569b9cccfc1dd647427cab8c2fcc7ba956dff46 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 09:06:46 -0500 Subject: [PATCH 38/60] Add tests for TF worker --- tests/dragon/test_tensorflow_worker.py | 222 +++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 tests/dragon/test_tensorflow_worker.py diff --git a/tests/dragon/test_tensorflow_worker.py b/tests/dragon/test_tensorflow_worker.py new file mode 100644 index 0000000000..bb47bcf40a --- /dev/null +++ b/tests/dragon/test_tensorflow_worker.py @@ -0,0 +1,222 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io +import typing as t + +import numpy as np +import pytest +tf = pytest.importorskip("tensorflow") +from tensorflow import keras +from tensorflow.python.framework.convert_to_constants import convert_var_to_const_function_in_v1 + +dragon = pytest.importorskip("dragon") +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryAlloc, MemoryPool + + +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + RequestBatch, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + + +def get_batch() -> np.typing.ArrayLike: + return np.random.randn(20, 28, 28).astype(np.float32) + +def create_tf_model(): + model = keras.Sequential( + layers=[ + keras.layers.InputLayer(input_shape=(28, 28), name="input"), + keras.layers.Flatten(input_shape=(28, 28), name="flatten"), + keras.layers.Dense(128, activation="relu", name="dense"), + keras.layers.Dense(10, activation="softmax", name="output"), + ], + name="FCN", + ) + + # Compile model with optimizer + model.compile( + optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] + ) + + + real_model = tf.function(model).get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) + ) + with tf.compat.v1.Session() as sess: + ffunc = convert_var_to_const_function_in_v1(real_model) + graph_def_orig = ffunc.graph.as_graph_def() + + graph_def_str = graph_def_orig.SerializeToString() + + names = lambda l: [x.name for x in l] + + return graph_def_str, names(ffunc.inputs), names(ffunc.outputs) + +tensorflow_device = {"cpu": "/CPU", "gpu": "/GPU"} + + +def get_request() -> InferenceRequest: + + tensors = [get_batch()] + serialized_tensors_descriptors = [ + MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key=FeatureStoreKey(key="model", descriptor="xyz"), + callback=None, + raw_inputs=tensors, + input_keys=None, + input_meta=serialized_tensors_descriptors, + output_keys=None, + raw_model=create_tf_model()[0], + batch_size=0, + ) + + +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) + + +sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) +worker = TensorFlowWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() + ) + + with tf.device(tensorflow_device[mlutils.get_test_device().lower()]): + results = load_model_result.model.run( + load_model_result.outputs, + feed_dict=dict(zip(load_model_result.inputs, [get_batch()])), + ) + + assert results[0].shape == (20,10) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + mem_pool = 
MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_input_result = worker.transform_input( + sample_request_batch, [fetch_input_result], mem_pool + ) + + batch = get_batch() + assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + tensor_index = 0 + assert tuple(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + + np.testing.assert_allclose(tensor, sample_request.raw_inputs[tensor_index]) + + mem_pool.destroy() + + +def test_execute(mlutils) -> None: + + graph_def_str, inputs, outputs = create_tf_model() + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(graph_def_str) + + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name="") + load_model_result = LoadModelResult( + tf.compat.v1.Session(graph=graph), inputs=inputs, outputs=outputs + ) + + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool + ) + + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) + + assert all( + result.shape == (20, 10) for result in execute_result.predictions + ) + + mem_pool.destroy() + + +def test_transform_output(mlutils): + tensors = [np.zeros((20, 10))] + execute_result = ExecuteResult(tensors, [slice(0, 20)]) + + transformed_output = worker.transform_output(sample_request_batch, execute_result) + + assert transformed_output[0].outputs == [item.tobytes() for item in tensors] + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" From 50ce6adbe60bed24109326d6545c02c61a566698 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 09:13:05 -0500 Subject: [PATCH 39/60] Improve standalone_worker_manager.py --- .../standalone_worker_manager.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index f92f199d2e..07fedc029e 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -25,9 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import dragon - -# pylint disable=import-error +# pylint: disable=import-error import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process @@ -35,10 +33,8 @@ from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.managed_memory import MemoryPool -from dragon.utils import b64decode, b64encode -# pylint enable=import-error +# pylint: enable=import-error # isort: off # isort: on @@ -46,18 +42,13 @@ import argparse import base64 import multiprocessing as mp -import optparse import os -import pickle import socket -import sys import time -import typing as t import cloudpickle from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.request_dispatcher import ( @@ -68,7 +59,6 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -125,13 +115,15 @@ def service_as_dragon_proc( "--batch_size", type=int, default=1, - help="How many requests the workers will try to aggregate before processing them", + help="How many requests the workers will try " + "to aggregate before processing them", ) parser.add_argument( "--batch_timeout", type=float, default=0.001, - help="How much time (in seconds) should be waited before processing an incomplete aggregated request", + help="How much time (in seconds) should be waited " + "before processing an incomplete aggregated request", ) args = parser.parse_args() From d49634fb898df09cf57ee20f3147362411fc5bd7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 09:16:46 -0500 Subject: [PATCH 40/60] Remove commented sections --- .../infrastructure/control/request_dispatcher.py | 2 -- .../mli/infrastructure/control/worker_manager.py | 1 - tests/dragon/test_tensorflow_worker.py | 16 ++++++++-------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index c61ab8a225..8d0cb376f5 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -405,8 +405,6 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - # if self._perf_timer.max_length == 801 and self._perf_timer.is_active: - # self._perf_timer.print_timings(True) def remove_queues(self) -> None: """Remove references to queues that can be removed diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index eef739ca05..ec04940c3a 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -182,7 +182,6 @@ def _on_iteration(self) -> None: return self._processed_batches += 1 - # print(f"**** PROCESSING BATCH {self._processed_batches} ****", flush=True) self._perf_timer.start_timings( "flush_requests", time.perf_counter() - 
pre_batch_time diff --git a/tests/dragon/test_tensorflow_worker.py b/tests/dragon/test_tensorflow_worker.py index bb47bcf40a..9bbbd0090f 100644 --- a/tests/dragon/test_tensorflow_worker.py +++ b/tests/dragon/test_tensorflow_worker.py @@ -29,15 +29,17 @@ import numpy as np import pytest + tf = pytest.importorskip("tensorflow") from tensorflow import keras -from tensorflow.python.framework.convert_to_constants import convert_var_to_const_function_in_v1 +from tensorflow.python.framework.convert_to_constants import ( + convert_var_to_const_function_in_v1, +) dragon = pytest.importorskip("dragon") import dragon.globalservices.pool as dragon_gs_pool from dragon.managed_memory import MemoryAlloc, MemoryPool - from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker from smartsim._core.mli.infrastructure.worker.worker import ( @@ -57,10 +59,10 @@ pytestmark = pytest.mark.dragon - def get_batch() -> np.typing.ArrayLike: return np.random.randn(20, 28, 28).astype(np.float32) + def create_tf_model(): model = keras.Sequential( layers=[ @@ -77,7 +79,6 @@ def create_tf_model(): optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] ) - real_model = tf.function(model).get_concrete_function( tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) ) @@ -91,6 +92,7 @@ def create_tf_model(): return graph_def_str, names(ffunc.inputs), names(ffunc.outputs) + tensorflow_device = {"cpu": "/CPU", "gpu": "/GPU"} @@ -138,7 +140,7 @@ def test_load_model(mlutils) -> None: feed_dict=dict(zip(load_model_result.inputs, [get_batch()])), ) - assert results[0].shape == (20,10) + assert results[0].shape == (20, 10) def test_transform_input(mlutils) -> None: @@ -203,9 +205,7 @@ def test_execute(mlutils) -> None: mlutils.get_test_device().lower(), ) - assert all( - result.shape == (20, 10) for result in execute_result.predictions - ) + assert all(result.shape == (20, 10) for result in execute_result.predictions) mem_pool.destroy() From a8d501b26afbc283352914ef35c603ea6346ec60 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 09:17:20 -0500 Subject: [PATCH 41/60] Style --- smartsim/_core/mli/infrastructure/control/request_dispatcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 8d0cb376f5..ef83411221 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -405,7 +405,6 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - def remove_queues(self) -> None: """Remove references to queues that can be removed and allow them to be garbage collected.""" From 79b49544eafaaaed6bf4e1abeef86865bae6af4f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 12:31:22 -0500 Subject: [PATCH 42/60] Switch to Channel.make_process_local --- ex/high_throughput_inference/mock_app.py | 7 +++---- ex/high_throughput_inference/mock_app_tensorflow.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index d6977e5a7b..e2165b67a0 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -77,12 +77,12 @@ def __init__(self, timing_on: bool): self._to_worker_fli = 
fli.FLInterface.attach(to_worker_fli_str) except KeyError: time.sleep(1) - self._from_worker_ch = DragonCommChannel(create_local()) + self._from_worker_ch = DragonCommChannel(Channel.make_process_local()) self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string self._to_worker_ch = Channel.make_process_local() self.perf_timer: PerfTimer = PerfTimer( - debug=True, timing_on=timing_on, prefix=f"a{self._rank}_" + debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" ) self._num_its: int = 0 @@ -191,13 +191,12 @@ def log(msg: str, rank: int) -> None: MPI.COMM_WORLD.Barrier() - TOTAL_ITERATIONS = 10 + TOTAL_ITERATIONS = 100 for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize log(f"Batch size: {b_size}", client._rank) for iteration_number in range(TOTAL_ITERATIONS): - # log(f"Iteration: {iteration_number}", client._rank) sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) log( diff --git a/ex/high_throughput_inference/mock_app_tensorflow.py b/ex/high_throughput_inference/mock_app_tensorflow.py index d70991980f..b7fa104e20 100644 --- a/ex/high_throughput_inference/mock_app_tensorflow.py +++ b/ex/high_throughput_inference/mock_app_tensorflow.py @@ -75,7 +75,7 @@ def __init__(self, timing_on: bool): self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) except KeyError: time.sleep(1) - self._from_worker_ch = DragonCommChannel(create_local()) + self._from_worker_ch = DragonCommChannel(Channel.make_process_local()) self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string self._to_worker_ch = Channel.make_process_local() @@ -189,7 +189,7 @@ def log(msg: str, rank: int) -> None: MPI.COMM_WORLD.Barrier() - TOTAL_ITERATIONS = 10 + TOTAL_ITERATIONS = 100 for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize From 152d434c5a958335ee334d0892a0febbf0bd44a5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 13:33:08 -0500 Subject: [PATCH 43/60] Add timeout and exc handling to WM response --- .../control/request_dispatcher.py | 4 ++-- .../infrastructure/control/worker_manager.py | 21 +++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index ef83411221..c449ed0f21 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -235,7 +235,7 @@ def __init__( raise SmartSimError("No incoming channel for dispatcher") self._incoming_channel = incoming_channel """The channel the dispatcher monitors for new tasks""" - self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=10000) """The queue on which batched inference requests are placed""" self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" @@ -250,7 +250,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" @property diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py 
b/smartsim/_core/mli/infrastructure/control/worker_manager.py index ec04940c3a..15e20829b3 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -101,7 +101,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" self._processed_batches: int = 0 """Number of processed request batches""" @@ -312,18 +312,21 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("serialize_resp") if request.callback: - request.callback.send(serialized_resp) - if reply.has_outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) + try: + request.callback.send(serialized_resp) + if reply.has_outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output, timeout=None) + except Exception as e: + exception_handler( + e, request.callback, "Error while sending response." + ) + continue self._perf_timer.measure_time("send") self._perf_timer.end_timings() - # if self._perf_timer.max_length == 1600: - # self._perf_timer.print_timings(True) - def _can_shutdown(self) -> bool: """Determine if the service can be shutdown. From 14b20f10786dfeaccd908d241022ddb9fa398fa6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 13:38:39 -0500 Subject: [PATCH 44/60] More commented lines to remove --- ex/high_throughput_inference/mock_app.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e2165b67a0..39a3ee02ff 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -115,7 +115,6 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor): self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! 
- # logger.info(f"{self._rank} sent tensors") self.perf_timer.measure_time("send_tensors") resp = self._from_worker_ch.recv(timeout=None) @@ -138,10 +137,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor): self.perf_timer.end_timings() self._num_its += 1 - # logger.info(f"{self._rank} got to the barrier {self._num_its}") self._comm.Barrier() - # time.sleep(0.01) - # logger.info(f"{self._rank} made it past the barrier {self._num_its}") return result def set_model(self, key: str, model: bytes): @@ -153,8 +149,6 @@ def __init__(self, name: str, model: str): self._model = None # torch.jit.load(model) self._name = name - # scripted = torch.jit.trace(self._model, self.get_batch()) - # torch.jit.save(scripted, buffer) with open(model, "rb") as model_file: buffer = io.BytesIO(model_file.read()) self._serialized_model = buffer.getvalue() From dbbdcd9d1fc6c169bda1b6f0d0c10a05461697e9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Sep 2024 13:57:44 -0500 Subject: [PATCH 45/60] Remove debug information --- smartsim/_core/mli/comm/channel/dragon_channel.py | 1 - .../_core/mli/infrastructure/control/request_dispatcher.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index d9254889e3..e528cae637 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -182,7 +182,6 @@ def from_descriptor( channel = dch.Channel.attach(actual_descriptor) return DragonCommChannel(channel) except Exception as ex: - logger.debug(f"Failed to create dragon comm channel: {descriptor!r}, {ex}") raise SmartSimError( f"Failed to create dragon comm channel: {descriptor!r}" ) from ex diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index c449ed0f21..b232c663aa 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -250,7 +250,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" @property @@ -353,7 +353,6 @@ def _on_iteration(self) -> None: try: self._perf_timer.is_active = True bytes_list: t.List[bytes] = self._incoming_channel.recv() - logger.debug("Received data") except Exception: self._perf_timer.is_active = False else: From b9bdf99478368cbdab16af391b8f0fe07442ab81 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 27 Sep 2024 10:19:11 -0500 Subject: [PATCH 46/60] Fix issue when keras layer has "resource" field --- .../mli/infrastructure/control/worker_manager.py | 2 +- .../infrastructure/worker/tensorflow_worker.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/worker_manager.py b/smartsim/_core/mli/infrastructure/control/worker_manager.py index 15e20829b3..db227a3ac4 100644 --- a/smartsim/_core/mli/infrastructure/control/worker_manager.py +++ b/smartsim/_core/mli/infrastructure/control/worker_manager.py @@ -101,7 +101,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = 
None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" self._processed_batches: int = 0 """Number of processed request batches""" diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index b9692e153c..a9b4abcadd 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -105,23 +105,28 @@ def load_model( if operation.type == "Placeholder": logger.debug( f"Input op name: {operation.name}, " - f"output shape : {operation.outputs[0].get_shape()}" + f"output shape: {operation.outputs[0].get_shape()}" ) input_layers.add(f"{operation.name}:0") + + # Code initially taken from + # apple.github.io/coremltools/docs-guides/source/tensorflow-1-workflow.html output_tensors = set() input_tensors = set() for operation in ops: for x in operation.inputs: - if x.name not in input_tensors: - input_tensors.add(x.name) + input_tensors.add(x.name) for operation in ops: if len(operation.outputs) > 0: x = operation.outputs[0] - if x.name not in input_tensors: + potential_names = [x.name] + name_split = x.name.split(":") + potential_names.append(":".join((name_split[0]+"/resource", name_split[-1]))) + if all(name not in input_tensors for name in potential_names): logger.debug( f"Output tensor name: {x.name}, " - f"tensor shape : {x.get_shape()}, " + f"tensor shape: {x.get_shape()}, " f"parent op type: {operation.type}" ) output_tensors.add(x.name) From 3f571f47f35debd2a73ce5e19cef49f8958db9ea Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 2 Oct 2024 14:41:15 -0500 Subject: [PATCH 47/60] Address first comments. --- smartsim/_core/mli/comm/channel/channel.py | 4 ++-- smartsim/_core/mli/comm/channel/dragon_channel.py | 4 ++-- smartsim/_core/mli/comm/channel/dragon_fli.py | 4 ++-- .../mli/infrastructure/worker/tensorflow_worker.py | 10 ++++++---- .../_core/mli/infrastructure/worker/torch_worker.py | 1 + 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 9a12e4c8dc..b56f443f44 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -44,7 +44,7 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: self._descriptor = descriptor @abstractmethod - def send(self, value: bytes, timeout: float = 0) -> None: + def send(self, value: bytes, timeout: t.Optional[float] = 0) -> None: """Send a message through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to send @@ -53,7 +53,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: """ @abstractmethod - def recv(self, timeout: float = 0) -> t.List[bytes]: + def recv(self, timeout: t.Optional[float] = 0) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
:param timeout: Maximum time to wait (in seconds) for messages to arrive diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index e528cae637..c607361a23 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -105,7 +105,7 @@ def channel(self) -> "dch.Channel": """ return self._channel - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: """Send a message through the underlying communication channel. :param value: The value to send @@ -121,7 +121,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: f"Error sending message: DragonCommChannel {self.descriptor!r}" ) from e - def recv(self, timeout: float = 0.001) -> t.List[bytes]: + def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to arrive diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 5f82b82abe..de79468aee 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -66,7 +66,7 @@ def __init__( create_local(buffer_size) if sender_supplied else None ) - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: """Send a message through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to send @@ -82,7 +82,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: f"Error sending message: DragonFLIChannel {self.descriptor!r}" ) from e - def recv(self, timeout: float = 0.001) -> t.List[bytes]: + def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to arrive diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index a9b4abcadd..c6547317aa 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -74,6 +74,7 @@ def load_model( device memory. :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching model :param device: The device on which the model must be placed :returns: LoadModelResult wrapping the model loaded for the request :raises ValueError: If model reference object is not found @@ -109,7 +110,6 @@ def load_model( ) input_layers.add(f"{operation.name}:0") - # Code initially taken from # apple.github.io/coremltools/docs-guides/source/tensorflow-1-workflow.html output_tensors = set() @@ -122,7 +122,9 @@ def load_model( x = operation.outputs[0] potential_names = [x.name] name_split = x.name.split(":") - potential_names.append(":".join((name_split[0]+"/resource", name_split[-1]))) + potential_names.append( + ":".join((name_split[0] + "/resource", name_split[-1])) + ) if all(name not in input_tensors for name in potential_names): logger.debug( f"Output tensor name: {x.name}, " @@ -148,8 +150,8 @@ def transform_input( """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. 
- :param request: The request that triggered the pipeline - :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param batch: The request batch that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult :raises ValueError: If tensors cannot be reconstructed diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 64e94e5eb6..5fd4bc4736 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -65,6 +65,7 @@ def load_model( device memory. :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching model :param device: The device on which the model must be placed :returns: LoadModelResult wrapping the model loaded for the request :raises ValueError: If model reference object is not found From dc7307a5a0347664431bbfeed105e71e2a19f3be Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 3 Oct 2024 12:55:21 -0500 Subject: [PATCH 48/60] Add ONNX worker --- .../mli/infrastructure/worker/onnx_worker.py | 288 ++++++++++++++++++ .../worker/tensorflow_worker.py | 4 + tests/dragon/test_onnx_worker.py | 215 +++++++++++++ 3 files changed, 507 insertions(+) create mode 100644 smartsim/_core/mli/infrastructure/worker/onnx_worker.py create mode 100644 tests/dragon/test_onnx_worker.py diff --git a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py new file mode 100644 index 0000000000..47cb11d936 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py @@ -0,0 +1,288 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import logging +import os + +import numpy as np +from onnx import load_model_from_string +from onnxruntime import InferenceSession # type: ignore + +# isort: off +# isort: on + +# pylint: disable=import-error +from dragon.managed_memory import MemoryAlloc, MemoryPool + +# pylint: enable=import-error + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) + + +logger = get_logger(__name__) + + +class ONNXWorker(MachineLearningWorkerBase): + """A worker that executes an ONNX model.""" + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + """Given a loaded MachineLearningModel, ensure it is loaded into + device memory. + + :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching model + :param device: The device on which the model must be placed + :returns: LoadModelResult wrapping the model loaded for the request + :raises ValueError: If model reference object is not found + :raises RuntimeError: If loading and evaluating the model failed + """ + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data + else: + raise ValueError("Unable to load model without reference object") + + try: + providers = [] + if "gpu" in device.lower(): + device_split = device.split(":") + if len(device_split) > 1: + provider_options = {"device_id": device_split[-1]} + if "ROCR_VISIBLE_DEVICES" in os.environ: + providers = [("ROCMExecutionProvider", provider_options)] + else: + providers = [("CUDAExecutionProvider", provider_options)] + + # Fallback + providers.append(("CPUExecutionProvider", {})) + + except Exception as e: + raise RuntimeError( + "Failed to load and evaluate the model: " + f"Model key {batch.model_id.key}, Device {device}" + ) from e + + onnx_deserialized = load_model_from_string(model_bytes) + output_tensors = [n.name for n in onnx_deserialized.graph.output] + input_layers = [n.name for n in onnx_deserialized.graph.input] + + session = InferenceSession( + model_bytes, providers=providers + ) + result = LoadModelResult( + session, + input_layers, + output_tensors, + ) + return result + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, + ) -> TransformInputResult: + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. 
+ + :param batch: The request batch that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs + :param mem_pool: The memory pool used to access batched input tensors + :returns: The transformed inputs wrapped in a TransformInputResult + :raises ValueError: If tensors cannot be reconstructed + :raises IndexError: If index out of range + """ + results: list[memoryview] = [] + total_samples = 0 + slices: list[slice] = [] + + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta + ): + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results) - 1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + try: + joined = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + mem_view[:alloc_size] = joined + except IndexError as e: + raise IndexError( + "Error accessing elements in fetch_result.inputs " + f"with index {result_tensor_idx}" + ) from e + + results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims, all_dtypes) + + # pylint: disable-next=unused-argument + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + """Execute an ML model on inputs transformed for use by the model. 
+ + :param batch: The batch of requests that triggered the pipeline + :param load_result: The result of loading the model onto device memory + :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed + :returns: The result of inference wrapped in an ExecuteResult + :raises SmartSimError: If model is not loaded + :raises IndexError: If memory slicing is out of range + :raises ValueError: If tensor creation fails or is unable to evaluate the model + """ + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + + tensors = [] + mem_allocs = [] + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + try: + tensors.append( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + except IndexError as e: + raise IndexError("Error during memory slicing") from e + except Exception as e: + raise ValueError("Error during tensor creation") from e + + sess = load_result.model + if load_result.inputs is None: + raise ValueError("Model was stored without inputs") + try: + results = sess.run( + load_result.outputs, + input_feed=dict(zip(load_result.inputs, tensors)), + ) + except Exception as e: + raise ValueError( + f"Error while evaluating the model: Model {batch.model_id.key}" + ) from e + + transform_result.transformed = [] + + execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() + return execute_result + + @staticmethod + def transform_output( + batch: RequestBatch, + execute_result: ExecuteResult, + ) -> list[TransformOutputResult]: + """Given inference results, perform transformations required to + transmit results to the requestor. + + :param batch: The batch of requests that triggered the pipeline + :param execute_result: The result of inference wrapped in an ExecuteResult + :returns: A list of transformed outputs + :raises IndexError: If indexing is out of range + :raises ValueError: If transforming output fails + """ + transformed_list: list[TransformOutputResult] = [] + cpu_predictions = execute_result.predictions + + for result_slice in execute_result.slices: + transformed = [] + for cpu_item in cpu_predictions: + try: + transformed.append(cpu_item[result_slice].tobytes()) + + # todo: need the shape from latest schemas added here. 
+ transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme + except IndexError as e: + raise IndexError( + f"Error accessing elements: result_slice {result_slice}" + ) from e + except Exception as e: + raise ValueError("Error transforming output") from e + + execute_result.predictions = [] + + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index c6547317aa..d532cc160b 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -87,6 +87,10 @@ def load_model( else: raise ValueError("Unable to load model without reference object") + device_to_tf = {"cpu": "/CPU", "gpu": "/GPU"} + for old, new in device_to_tf.items(): + device = device.replace(old, new) + try: graph_def = tf.compat.v1.GraphDef() graph_def.ParseFromString(model_bytes) diff --git a/tests/dragon/test_onnx_worker.py b/tests/dragon/test_onnx_worker.py new file mode 100644 index 0000000000..bd611194d2 --- /dev/null +++ b/tests/dragon/test_onnx_worker.py @@ -0,0 +1,215 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io +import typing as t + +import numpy as np +import numpy.typing as npt +import pytest + +onnx = pytest.importorskip("onnx") +from onnx import load_model_from_string +pytest.importorskip("onnxruntime") +from onnxruntime import InferenceSession +dragon = pytest.importorskip("dragon") + +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression + +from skl2onnx import to_onnx + +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryAlloc, MemoryPool + +from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.onnx_worker import ONNXWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + RequestBatch, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def get_X() -> npt.ArrayLike: + return np.linspace(0, 10, 10).astype(np.float32) + +def get_poly_features() -> npt.ArrayLike: + + poly = PolynomialFeatures(degree=2, include_bias=False) + return poly.fit_transform(get_X().reshape(-1, 1)) + +def get_Y() -> npt.ArrayLike: + p = np.polynomial.Polynomial([1.4, -10, 4]) + return p(get_X()) + +def create_onnx_model(): + + poly_features = get_poly_features() + poly_reg_model = LinearRegression() + + poly_reg_model.fit(poly_features, get_Y()) + onnx_model = to_onnx(poly_reg_model, poly_features, target_opset=13) + + onnx_serialized = onnx_model.SerializeToString() + output_names = [n.name for n in onnx_model.graph.output] + input_names = [n.name for n in onnx_model.graph.input] + + return onnx_serialized, input_names, output_names + + + +def get_request() -> InferenceRequest: + + tensors = [get_poly_features()] + serialized_tensors_descriptors = [ + MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key=FeatureStoreKey(key="model", descriptor="xyz"), + callback=None, + raw_inputs=tensors, + input_keys=None, + input_meta=serialized_tensors_descriptors, + output_keys=None, + raw_model=create_onnx_model()[0], + batch_size=0, + ) + + +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) + + +sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) +worker = ONNXWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() + ) + + results = load_model_result.model.run( + load_model_result.outputs, + input_feed=dict(zip(load_model_result.inputs, [get_poly_features()])), + ) + + assert results[0].shape == (10, 1) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_input_result = worker.transform_input( + sample_request_batch, [fetch_input_result], mem_pool + ) + + batch = 
get_poly_features() + assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + tensor_index = 0 + assert tuple(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + + np.testing.assert_allclose(tensor, sample_request.raw_inputs[tensor_index]) + + mem_pool.destroy() + + +def test_execute(mlutils) -> None: + + onnx_serialized, inputs, outputs = create_onnx_model() + + providers = ['CPUExecutionProvider'] + session = InferenceSession(onnx_serialized, providers=providers) + load_model_result = LoadModelResult( + session, inputs=inputs, outputs=outputs + ) + + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool + ) + + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) + + assert all(result.shape == (10, 1) for result in execute_result.predictions) + + mem_pool.destroy() + + +def test_transform_output(mlutils): + tensors = [np.zeros((10, 1))] + execute_result = ExecuteResult(tensors, [slice(0, 10)]) + + transformed_output = worker.transform_output(sample_request_batch, execute_result) + + assert transformed_output[0].outputs == [item.tobytes() for item in tensors] + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" From dccc07d711c6b62861d928c04afbdf22ab07ceae Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 4 Oct 2024 10:45:30 -0500 Subject: [PATCH 49/60] Add onnx mock app --- ex/high_throughput_inference/mli_driver.py | 17 +- ex/high_throughput_inference/mock_app_onnx.py | 215 ++++++++++++++++++ .../mock_app_tensorflow.py | 2 +- 3 files changed, 227 insertions(+), 7 deletions(-) create mode 100644 ex/high_throughput_inference/mock_app_onnx.py diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c66bda30a6..2cb2852d52 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -9,15 +9,13 @@ import cloudpickle from smartsim import Experiment -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker from smartsim.settings import DragonRunSettings from smartsim.status import TERMINAL_STATUSES parser = argparse.ArgumentParser("Mock application") parser.add_argument("--log_max_batchsize", default=8, type=int) parser.add_argument("--num_nodes_app", default=1, type=int) -parser.add_argument("--toolkit", default="torch", choices=["torch","tensorflow"], type=str) +parser.add_argument("--toolkit", default="torch", choices=["torch","tensorflow","onnx"], type=str) args = parser.parse_args() DEVICE = "gpu" @@ -31,9 +29,9 @@ if args.toolkit == "torch": # keeping old name for backward compatibility 
app_script_name = os.path.join(filedir, "mock_app.py") + model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") else: app_script_name = os.path.join(filedir, f"mock_app_{args.toolkit}.py") -model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" @@ -55,9 +53,15 @@ exp = Experiment("MLI_benchmark", launcher="dragon", exp_path=exp_path) if args.toolkit == "torch": + from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") elif args.toolkit == "tensorflow": + from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker worker_str = base64.b64encode(cloudpickle.dumps(TensorFlowWorker)).decode("ascii") +elif args.toolkit == "onnx": + from smartsim._core.mli.infrastructure.worker.onnx_worker import ONNXWorker + worker_str = base64.b64encode(cloudpickle.dumps(ONNXWorker)).decode("ascii") + worker_manager_rs: DragonRunSettings = exp.create_run_settings( @@ -81,7 +85,7 @@ worker_manager_rs.set_cpu_affinity(aff) worker_manager_rs.set_gpu_affinity([0, 1, 2, 3]) -worker_manager_rs.set_hostlist(["pinoak0043"]) +worker_manager_rs.set_hostlist(["pinoak0037"]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) @@ -99,7 +103,8 @@ app_rs.set_nodes(NUM_NODES_APP) app = exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) +if args.toolkit == "torch": + app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) exp.generate(worker_manager, app, overwrite=True) exp.start(worker_manager, block=False) diff --git a/ex/high_throughput_inference/mock_app_onnx.py b/ex/high_throughput_inference/mock_app_onnx.py new file mode 100644 index 0000000000..dfa93937ac --- /dev/null +++ b/ex/high_throughput_inference/mock_app_onnx.py @@ -0,0 +1,215 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import os +import time +import typing as t +import warnings + +from mpi4py import MPI +import numpy +from numpy.polynomial import Polynomial + +import onnx +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression + +from skl2onnx import to_onnx + +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel, create_local +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils.timings import PerfTimer +from smartsim.log import get_logger + +logger = get_logger("App") + + +class ProtoClient: + def __init__(self, timing_on: bool): + self._comm = MPI.COMM_WORLD + self._rank = self._comm.Get_rank() + connect_to_infrastructure() + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) + except KeyError: + time.sleep(1) + self._from_worker_ch = DragonCommChannel(Channel.make_process_local()) + self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string + self._to_worker_ch = Channel.make_process_local() + + self.perf_timer: PerfTimer = PerfTimer( + debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" + ) + self._num_its: int = 0 + + def run_model(self, model: t.Union[bytes, str], batch: numpy.typing.ArrayLike): + tensors = [batch] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) + else: + model_arg = MessageHandler.build_model(model, "lin_reg", "1.0") + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch_serialized, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) + self.perf_timer.measure_time("send_tensors") + resp = self._from_worker_ch.recv(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp[0]) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(response.result.descriptors)? 
+ if len(resp) > 1: + data_blob = resp[1] + else: + data_blob: bytes = self._from_worker_ch.recv(timeout=None)[0] + self.perf_timer.measure_time("receive_tensor") + result = numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + self._num_its += 1 + self._comm.Barrier() + return result + + def set_model(self, key: str, model: bytes): + self._ddict[key] = model + + +class LinRegWrapper: + def __init__( + self, + name: str, + model: onnx.onnx_ml_pb2.ModelProto, + ): + self._get_onnx_model(model) + self._name = name + self._poly = PolynomialFeatures + + def _get_onnx_model(self, model: onnx.onnx_ml_pb2.ModelProto): + self._serialized_model = model.SerializeToString() + + def get_batch(self, batch_size: int = 32): + x = numpy.random.randn(batch_size, 1).astype(numpy.float32) + return poly.fit_transform(x.reshape(-1,1)) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + + +def log(msg: str, rank: int) -> None: + if rank == 0: + logger.info(msg) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu", type=str) + parser.add_argument("--log_max_batchsize", default=8, type=int) + args = parser.parse_args() + + X = numpy.linspace(0, 10, 10).astype(numpy.float32) + poly = PolynomialFeatures(degree=2, include_bias=False) + p = Polynomial([1.4, -10, 4]) + poly_features = poly.fit_transform(X.reshape(-1, 1)) + poly_reg_model = LinearRegression() + poly_reg_model.fit(poly_features, p(X)) + + onnx_model = to_onnx(poly_reg_model, poly_features, target_opset=13) + + linreg = LinRegWrapper("LinReg", onnx_model) + + client = ProtoClient(timing_on=True) + + if client._rank == 0: + client.set_model(linreg.name, linreg.model) + + MPI.COMM_WORLD.Barrier() + + TOTAL_ITERATIONS = 100 + + for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): + b_size: int = 2**log2_bsize + log(f"Batch size: {b_size}", client._rank) + for iteration_number in range(TOTAL_ITERATIONS): + sample_batch = linreg.get_batch(b_size) + remote_result = client.run_model(linreg.name, sample_batch) + log( + f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", + client._rank, + ) + + client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) diff --git a/ex/high_throughput_inference/mock_app_tensorflow.py b/ex/high_throughput_inference/mock_app_tensorflow.py index b7fa104e20..0ecd5bb17d 100644 --- a/ex/high_throughput_inference/mock_app_tensorflow.py +++ b/ex/high_throughput_inference/mock_app_tensorflow.py @@ -42,9 +42,9 @@ import typing as t import warnings +from mpi4py import MPI import numpy import tensorflow as tf -from mpi4py import MPI from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2_as_graph, ) From a590a36cb71987c4a91f6b00e8ccbe29c53c9ca0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 4 Oct 2024 10:54:42 -0500 Subject: [PATCH 50/60] Style --- .../mli/infrastructure/worker/onnx_worker.py | 20 +++++++++++-------- tests/dragon/test_onnx_worker.py | 20 +++++++++---------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py index 47cb11d936..42b2c833c2 100644 --- 
a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py @@ -37,8 +37,6 @@ # pylint: disable=import-error from dragon.managed_memory import MemoryAlloc, MemoryPool -# pylint: enable=import-error - from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp @@ -53,6 +51,8 @@ TransformOutputResult, ) +# pylint: enable=import-error + logger = get_logger(__name__) @@ -83,18 +83,21 @@ def load_model( try: providers = [] + provider_options = [] if "gpu" in device.lower(): device_split = device.split(":") if len(device_split) > 1: - provider_options = {"device_id": device_split[-1]} + provider_options.append({"device_id": device_split[-1]}) + else: + provider_options.append({}) if "ROCR_VISIBLE_DEVICES" in os.environ: - providers = [("ROCMExecutionProvider", provider_options)] + providers = ["ROCMExecutionProvider"] else: - providers = [("CUDAExecutionProvider", provider_options)] + providers = ["CUDAExecutionProvider"] # Fallback - providers.append(("CPUExecutionProvider", {})) - + providers.append("CPUExecutionProvider") + provider_options.append({}) except Exception as e: raise RuntimeError( "Failed to load and evaluate the model: " @@ -105,8 +108,9 @@ def load_model( output_tensors = [n.name for n in onnx_deserialized.graph.output] input_layers = [n.name for n in onnx_deserialized.graph.input] + print(device, providers, provider_options) session = InferenceSession( - model_bytes, providers=providers + model_bytes, providers=providers, provider_options=provider_options ) result = LoadModelResult( session, diff --git a/tests/dragon/test_onnx_worker.py b/tests/dragon/test_onnx_worker.py index bd611194d2..f4103741dd 100644 --- a/tests/dragon/test_onnx_worker.py +++ b/tests/dragon/test_onnx_worker.py @@ -33,17 +33,17 @@ onnx = pytest.importorskip("onnx") from onnx import load_model_from_string + pytest.importorskip("onnxruntime") from onnxruntime import InferenceSession -dragon = pytest.importorskip("dragon") -from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import LinearRegression - -from skl2onnx import to_onnx +dragon = pytest.importorskip("dragon") import dragon.globalservices.pool as dragon_gs_pool from dragon.managed_memory import MemoryAlloc, MemoryPool +from skl2onnx import to_onnx +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.onnx_worker import ONNXWorker @@ -67,15 +67,18 @@ def get_X() -> npt.ArrayLike: return np.linspace(0, 10, 10).astype(np.float32) + def get_poly_features() -> npt.ArrayLike: poly = PolynomialFeatures(degree=2, include_bias=False) return poly.fit_transform(get_X().reshape(-1, 1)) + def get_Y() -> npt.ArrayLike: p = np.polynomial.Polynomial([1.4, -10, 4]) return p(get_X()) + def create_onnx_model(): poly_features = get_poly_features() @@ -91,7 +94,6 @@ def create_onnx_model(): return onnx_serialized, input_names, output_names - def get_request() -> InferenceRequest: tensors = [get_poly_features()] @@ -173,11 +175,9 @@ def test_execute(mlutils) -> None: onnx_serialized, inputs, outputs = create_onnx_model() - providers = ['CPUExecutionProvider'] + providers = ["CPUExecutionProvider"] session = InferenceSession(onnx_serialized, providers=providers) - load_model_result = LoadModelResult( - session, inputs=inputs, outputs=outputs - ) + 
load_model_result = LoadModelResult(session, inputs=inputs, outputs=outputs) fetch_input_result = FetchInputResult( sample_request.raw_inputs, sample_request.input_meta From bf324d2918ebf4131b703ba8d83e63f7afa43346 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 4 Oct 2024 12:42:46 -0500 Subject: [PATCH 51/60] Add optional compile step for Torch model --- ex/high_throughput_inference/mli_driver.py | 2 -- smartsim/_core/mli/infrastructure/worker/onnx_worker.py | 1 - smartsim/_core/mli/infrastructure/worker/torch_worker.py | 6 ++++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 2cb2852d52..239db3107c 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -62,8 +62,6 @@ from smartsim._core.mli.infrastructure.worker.onnx_worker import ONNXWorker worker_str = base64.b64encode(cloudpickle.dumps(ONNXWorker)).decode("ascii") - - worker_manager_rs: DragonRunSettings = exp.create_run_settings( sys.executable, [ diff --git a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py index 42b2c833c2..b22917814c 100644 --- a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py @@ -108,7 +108,6 @@ def load_model( output_tensors = [n.name for n in onnx_deserialized.graph.output] input_layers = [n.name for n in onnx_deserialized.graph.input] - print(device, providers, provider_options) session = InferenceSession( model_bytes, providers=providers, provider_options=provider_options ) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 5fd4bc4736..8eec7bcc11 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -92,6 +92,12 @@ def load_model( "Failed to load and evaluate the model: " f"Model key {batch.model_id.key}, Device {device}" ) from e + try: + model = torch.compile(model, dynamic=True) + except Exception as exc: + logger.info("Could not compile Torch model, original exception: ") + logger.info(exc) + pass result = LoadModelResult(model) return result From ca01cb1921e30b06209a92f64446c3d49175a54c Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 10 Oct 2024 13:58:55 -0400 Subject: [PATCH 52/60] Add integration of dragon-based event broadcasting (#710) This PR integrates event publishers and consumers in `ProtoClient` and `DragonBackend` [ committed by @ankona] [ reviewed by @al-rigazzi @mellis13 @amandarichardsonn ] --- conftest.py | 2 +- doc/changelog.md | 2 +- ex/high_throughput_inference/mock_app.py | 125 ++--- .../standalone_worker_manager.py | 61 ++- smartsim/_core/_cli/scripts/dragon_install.py | 12 +- smartsim/_core/entrypoints/service.py | 85 +++- .../_core/launcher/dragon/dragonBackend.py | 155 ++++-- .../_core/launcher/dragon/dragonConnector.py | 85 +++- smartsim/_core/mli/client/__init__.py | 0 smartsim/_core/mli/client/protoclient.py | 348 +++++++++++++ smartsim/_core/mli/comm/channel/channel.py | 25 +- .../_core/mli/comm/channel/dragon_channel.py | 115 +---- smartsim/_core/mli/comm/channel/dragon_fli.py | 85 ++-- .../_core/mli/comm/channel/dragon_util.py | 131 +++++ .../_core/mli/infrastructure/comm/__init__.py | 0 .../mli/infrastructure/comm/broadcaster.py | 239 +++++++++ .../_core/mli/infrastructure/comm/consumer.py | 281 +++++++++++ 
.../_core/mli/infrastructure/comm/event.py | 162 ++++++ .../_core/mli/infrastructure/comm/producer.py | 44 ++ .../infrastructure/control/error_handling.py | 2 +- .../mli/infrastructure/control/listener.py | 352 +++++++++++++ .../control/request_dispatcher.py | 25 +- .../mli/infrastructure/environment_loader.py | 15 +- .../storage/backbone_feature_store.py | 470 ++++++------------ .../storage/dragon_feature_store.py | 39 +- .../mli/infrastructure/storage/dragon_util.py | 101 ++++ .../infrastructure/storage/feature_store.py | 34 +- .../_core/mli/infrastructure/worker/worker.py | 56 ++- smartsim/_core/mli/message_handler.py | 28 +- .../mli_schemas/data/data_references.capnp | 4 +- .../data/data_references_capnp.pyi | 4 +- .../mli/mli_schemas/request/request.capnp | 2 +- .../mli/mli_schemas/request/request_capnp.pyi | 2 +- smartsim/_core/utils/timings.py | 8 +- smartsim/log.py | 13 +- tests/dragon/channel.py | 40 +- tests/dragon/conftest.py | 129 +++++ .../test_core_machine_learning_worker.py | 54 +- tests/dragon/test_device_manager.py | 15 +- tests/dragon/test_dragon_backend.py | 307 ++++++++++++ tests/dragon/test_dragon_ddict_utils.py | 117 +++++ tests/dragon/test_environment_loader.py | 62 ++- tests/dragon/test_error_handling.py | 121 +++-- tests/dragon/test_event_consumer.py | 386 ++++++++++++++ tests/dragon/test_featurestore.py | 327 ++++++++++++ tests/dragon/test_featurestore_base.py | 271 +++++++--- tests/dragon/test_featurestore_integration.py | 224 ++++----- tests/dragon/test_inference_reply.py | 6 +- tests/dragon/test_inference_request.py | 6 +- tests/dragon/test_protoclient.py | 313 ++++++++++++ tests/dragon/test_reply_building.py | 1 - tests/dragon/test_request_dispatcher.py | 238 +++------ tests/dragon/test_torch_worker.py | 4 +- tests/dragon/test_worker_manager.py | 222 ++++++--- tests/dragon/utils/channel.py | 42 +- tests/dragon/utils/msg_pump.py | 225 +++++++++ tests/mli/channel.py | 40 +- tests/mli/test_default_torch_worker.py | 206 -------- tests/mli/test_service.py | 109 +++- tests/test_dragon_comm_utils.py | 257 ++++++++++ tests/test_dragon_installer.py | 18 +- tests/test_dragon_launcher.py | 19 + .../test_build_model_key.py | 2 +- tests/test_message_handler/test_request.py | 28 +- 64 files changed, 5328 insertions(+), 1573 deletions(-) create mode 100644 smartsim/_core/mli/client/__init__.py create mode 100644 smartsim/_core/mli/client/protoclient.py create mode 100644 smartsim/_core/mli/comm/channel/dragon_util.py create mode 100644 smartsim/_core/mli/infrastructure/comm/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/comm/broadcaster.py create mode 100644 smartsim/_core/mli/infrastructure/comm/consumer.py create mode 100644 smartsim/_core/mli/infrastructure/comm/event.py create mode 100644 smartsim/_core/mli/infrastructure/comm/producer.py create mode 100644 smartsim/_core/mli/infrastructure/control/listener.py create mode 100644 smartsim/_core/mli/infrastructure/storage/dragon_util.py create mode 100644 tests/dragon/conftest.py create mode 100644 tests/dragon/test_dragon_backend.py create mode 100644 tests/dragon/test_dragon_ddict_utils.py create mode 100644 tests/dragon/test_event_consumer.py create mode 100644 tests/dragon/test_featurestore.py create mode 100644 tests/dragon/test_protoclient.py create mode 100644 tests/dragon/utils/msg_pump.py delete mode 100644 tests/mli/test_default_torch_worker.py create mode 100644 tests/test_dragon_comm_utils.py diff --git a/conftest.py b/conftest.py index 991c0d17b6..54a47f9e23 100644 --- a/conftest.py +++ 
b/conftest.py @@ -93,6 +93,7 @@ test_hostlist = None has_aprun = shutil.which("aprun") is not None + def get_account() -> str: return test_account @@ -227,7 +228,6 @@ def kill_all_test_spawned_processes() -> None: print("Not all processes were killed after test") - def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: diff --git a/doc/changelog.md b/doc/changelog.md index 7d08c9376f..b0e326d1f7 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,12 +13,12 @@ Jump to: Description +- Implement asynchronous notifications for shared data - Quick bug fix in _validate - Add helper methods to MLI classes - Update error handling for consistency - Parameterize installation of dragon package with `smart build` - Update docstrings -- Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention - Refactor `exception_handler` diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index dcc52296ef..c3b3eaaf4c 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -37,18 +37,10 @@ import argparse import io -import numpy -import os -import time + import torch -from mpi4py import MPI -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from smartsim._core.utils.timings import PerfTimer torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -56,83 +48,24 @@ logger = get_logger("App") logger.info("Started app") -CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False +from collections import OrderedDict -class ProtoClient: - def __init__(self, timing_on: bool): - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - self._ddict = DDict.attach(ddict_str) - self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - to_worker_fli_str = None - while to_worker_fli_str is None: - try: - to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - except KeyError: - time.sleep(1) - self._from_worker_ch = Channel.make_process_local() - self._from_worker_ch_serialized = self._from_worker_ch.serialize() - self._to_worker_ch = Channel.make_process_local() - - self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") - - def run_model(self, model: bytes | str, batch: torch.Tensor): - tensors = [batch.numpy()] - self.perf_timer.start_timings("batch_size", batch.shape[0]) - built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape) - ) - self.perf_timer.measure_time("build_tensor_descriptor") - if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) - else: - model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") - request = MessageHandler.build_request( - reply_channel=self._from_worker_ch_serialized, - model=model_arg, - inputs=[built_tensor_desc], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - self.perf_timer.measure_time("build_request") - request_bytes = MessageHandler.serialize_request(request) - self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: - 
to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! - self.perf_timer.measure_time("send_tensors") - with self._from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? - data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_tensor") - result = torch.from_numpy( - numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) - ) - self.perf_timer.measure_time("deserialize_tensor") +from smartsim.log import get_logger, log_to_file +from smartsim._core.mli.client.protoclient import ProtoClient - self.perf_timer.end_timings() - return result +logger = get_logger("App") - def set_model(self, key: str, model: bytes): - self._ddict[key] = model +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False class ResNetWrapper: + """Wrapper around a pre-rained ResNet model.""" def __init__(self, name: str, model: str): + """Initialize the instance. + + :param name: The name to use for the model + :param model: The path to the pre-trained PyTorch model""" self._model = torch.jit.load(model) self._name = name buffer = io.BytesIO() @@ -141,16 +74,28 @@ def __init__(self, name: str, model: str): self._serialized_model = buffer.getvalue() def get_batch(self, batch_size: int = 32): + """Create a random batch of data with the correct dimensions to + invoke a ResNet model. + + :param batch_size: The desired number of samples to produce + :returns: A PyTorch tensor""" return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property - def model(self): + def model(self) -> bytes: + """The content of a model file. + + :returns: The model bytes""" return self._serialized_model @property - def name(self): + def name(self) -> str: + """The name applied to the model. 
+ + :returns: The name""" return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -166,24 +111,32 @@ def name(self): if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: # TODO: adapt to non-Nvidia devices torch_device = args.device.replace("gpu", "cuda") - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to( + torch_device + ) TOTAL_ITERATIONS = 100 - for log2_bsize in range(args.log_max_batchsize+1): + for log2_bsize in range(args.log_max_batchsize + 1): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") - for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): + for iteration_number in range(TOTAL_ITERATIONS + int(b_size == 1)): logger.info(f"Iteration: {iteration_number}") sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) logger.info(client.perf_timer.get_last("total_time")) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: local_res = pt_model(sample_batch.to(torch_device)) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() + err_norm = torch.linalg.vector_norm( + torch.flatten(remote_result).to(torch_device) + - torch.flatten(local_res), + ord=1, + ).cpu() res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() - logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + logger.info( + f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}" + ) torch.cuda.synchronize() - client.perf_timer.print_timings(to_file=True) \ No newline at end of file + client.perf_timer.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index feb1af1aee..b4527bc5d2 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -37,6 +37,7 @@ from dragon.globalservices.api_setup import connect_to_infrastructure from dragon.managed_memory import MemoryPool from dragon.utils import b64decode, b64encode + # pylint enable=import-error # isort: off @@ -46,33 +47,27 @@ import base64 import multiprocessing as mp import os -import pickle import socket -import sys import time import typing as t import cloudpickle -import optparse -import os from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) from smartsim._core.mli.infrastructure.control.worker_manager import WorkerManager from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from 
smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase - from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -85,7 +80,6 @@ logger.info(f"CPUS: {os.cpu_count()}") - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] ) -> dragon_process.Process: @@ -108,8 +102,6 @@ def service_as_dragon_proc( ) - - if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -143,27 +135,26 @@ def service_as_dragon_proc( args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - ddict = DDict.attach(ddict_str) + ddict_str = os.environ[BackboneFeatureStore.MLI_BACKBONE] + + backbone = BackboneFeatureStore.from_descriptor(ddict_str) - to_worker_channel = Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - ddict["to_worker_fli"] = to_worker_fli_serialized + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) + + backbone.worker_queue = to_worker_fli_comm_ch.descriptor + + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor arg_worker_type = cloudpickle.loads( base64.b64decode(args.worker_class.encode("ascii")) ) - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) - - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) @@ -178,7 +169,7 @@ def service_as_dragon_proc( worker_device = args.device for wm_idx in range(args.num_workers): - worker_manager = WorkerManager( + worker_manager = WorkerManager( config_loader=config_loader, worker_type=arg_worker_type, as_service=True, @@ -196,21 +187,25 @@ def service_as_dragon_proc( # the GPU-to-CPU mapping is taken from the nvidia-smi tool # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} - gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) - gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) - gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) - gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + gpu_to_cpu_aff[0] = list(range(48, 64)) + list(range(112, 128)) + gpu_to_cpu_aff[1] = list(range(32, 48)) + list(range(96, 112)) + gpu_to_cpu_aff[2] = list(range(16, 32)) + list(range(80, 96)) + gpu_to_cpu_aff[3] = list(range(0, 16)) + list(range(64, 80)) worker_manager_procs = [] for worker_idx in range(args.num_workers): wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) - worker_manager_procs.append(service_as_dragon_proc( + worker_manager_procs.append( + service_as_dragon_proc( worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] - )) + ) + ) - dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + dispatcher_proc = service_as_dragon_proc( + dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[] + ) # TODO: use ProcessGroup and restart=True? 
all_procs = [dispatcher_proc, *worker_manager_procs] diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 4fd0be3004..b6666f7c8e 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -57,7 +57,7 @@ def __init__( def _check(self) -> None: """Perform validation of this instance - :raises: ValueError if any value fails validation""" + :raises ValueError: if any value fails validation""" if not self.repo_name or len(self.repo_name.split("/")) != 2: raise ValueError( f"Invalid dragon repository name. Example: `dragonhpc/dragon`" @@ -95,13 +95,13 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) - dragon_inc_dir = str(dragon_root_dir / "include") - dragon_lib_dir = str(dragon_root_dir / "lib") - dragon_bin_dir = str(dragon_root_dir / "bin") + dragon_inc_dir = dragon_root + "/include" + dragon_lib_dir = dragon_root + "/lib" + dragon_bin_dir = dragon_root + "/bin" dragon_vars = { "DRAGON_BASE_DIR": dragon_root, - "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir + "DRAGON_ROOT_DIR": dragon_root, "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, "DRAGON_VERSION": dragon_version, @@ -286,7 +286,7 @@ def retrieve_asset( :param request: details of a request for the installation of the dragon package :param asset: GitHub release asset to retrieve :returns: path to the directory containing the extracted release asset - :raises: SmartSimCLIActionCancelled if the asset cannot be downloaded or extracted + :raises SmartSimCLIActionCancelled: if the asset cannot be downloaded or extracted """ download_dir = request.working_dir / str(asset.id) diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b67..719c2a60fe 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -35,26 +35,50 @@ class Service(ABC): - """Base contract for standalone entrypoint scripts. Defines API for entrypoint - behaviors (event loop, automatic shutdown, cooldown) as well as simple - hooks for status changes""" + """Core API for standalone entrypoint scripts. Makes use of overridable hook + methods to modify behaviors (event loop, automatic shutdown, cooldown) as + well as simple hooks for status changes""" def __init__( - self, as_service: bool = False, cooldown: int = 0, loop_delay: int = 0 + self, + as_service: bool = False, + cooldown: float = 0, + loop_delay: float = 0, + health_check_frequency: float = 0, ) -> None: - """Initialize the ServiceHost - :param as_service: Determines if the host will run until shutdown criteria - are met or as a run-once instance - :param cooldown: Period of time to allow service to run before automatic - shutdown, in seconds. A non-zero, positive integer. - :param loop_delay: delay between iterations of the event loop""" + """Initialize the Service + + :param as_service: Determines lifetime of the service. When `True`, calling + execute on the service will run continuously until shutdown criteria are met. + Otherwise, `execute` performs a single pass through the service lifecycle and + automatically exits (regardless of the result of `_can_shutdown`). 
+ :param cooldown: Period of time (in seconds) to allow the service to run + after a shutdown is permitted. Enables the service to avoid restarting if + new work is discovered. A value of 0 disables the cooldown. + :param loop_delay: Duration (in seconds) of a forced delay between + iterations of the event loop + :param health_check_frequency: Time (in seconds) between calls to a + health check handler. A value of 0 triggers the health check on every + iteration. + """ self._as_service = as_service - """If the service should run until shutdown function returns True""" + """Determines lifetime of the service. When `True`, calling + `execute` on the service will run continuously until shutdown criteria are met. + Otherwise, `execute` performs a single pass through the service lifecycle and + automatically exits (regardless of the result of `_can_shutdown`).""" self._cooldown = abs(cooldown) - """Duration of a cooldown period between requests to the service - before shutdown""" + """Period of time (in seconds) to allow the service to run + after a shutdown is permitted. Enables the service to avoid restarting if + new work is discovered. A value of 0 disables the cooldown.""" self._loop_delay = abs(loop_delay) - """Forced delay between iterations of the event loop""" + """Duration (in seconds) of a forced delay between + iterations of the event loop""" + self._health_check_frequency = health_check_frequency + """Time (in seconds) between calls to a + health check handler. A value of 0 triggers the health check on every + iteration.""" + self._last_health_check = time.time() + """The timestamp of the latest health check""" @abstractmethod def _on_iteration(self) -> None: @@ -68,7 +92,7 @@ def _can_shutdown(self) -> bool: def _on_start(self) -> None: """Empty hook method for use by subclasses. Called on initial entry into - ServiceHost `execute` event loop before `_on_iteration` is invoked.""" + Service `execute` event loop before `_on_iteration` is invoked.""" logger.debug(f"Starting {self.__class__.__name__}") def _on_shutdown(self) -> None: @@ -76,6 +100,11 @@ def _on_shutdown(self) -> None: the main event loop during automatic shutdown.""" logger.debug(f"Shutting down {self.__class__.__name__}") + def _on_health_check(self) -> None: + """Empty hook method for use by subclasses. Invoked based on the + value of `self._health_check_frequency`.""" + logger.debug(f"Performing health check for {self.__class__.__name__}") + def _on_cooldown_elapsed(self) -> None: """Empty hook method for use by subclasses. Called on every event loop iteration immediately upon exceeding the cooldown period""" @@ -98,13 +127,30 @@ def execute(self) -> None: """The main event loop of a service host. Evaluates shutdown criteria and combines with a cooldown period to allow automatic service termination. 
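Editor's sketch (not part of this patch): the hook methods documented above compose into a small state machine driven by `execute`. A minimal, hypothetical subclass showing how `_on_iteration`, `_can_shutdown`, and the new health-check hook fit together, assuming only the `Service` API visible in this diff:

import time

from smartsim._core.entrypoints.service import Service


class CountingService(Service):
    """Toy service: performs ten units of work, then allows automatic shutdown."""

    def __init__(self) -> None:
        # run as a long-lived service, linger 2s after shutdown is permitted,
        # and invoke the health-check hook roughly every 5 seconds
        super().__init__(as_service=True, cooldown=2, health_check_frequency=5)
        self._completed = 0

    def _on_iteration(self) -> None:
        self._completed += 1          # the per-loop unit of work
        time.sleep(0.1)

    def _on_health_check(self) -> None:
        print(f"health check: {self._completed} iterations completed")

    def _can_shutdown(self) -> bool:
        return self._completed >= 10  # shutdown criteria evaluated every loop


if __name__ == "__main__":
    CountingService().execute()       # returns once the cooldown elapses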
Responsible for executing calls to subclass implementation of `_on_iteration`""" - self._on_start() + + try: + self._on_start() + except Exception: + logger.exception("Unable to start service.") + return running = True cooldown_start: t.Optional[datetime.datetime] = None while running: - self._on_iteration() + try: + self._on_iteration() + except Exception: + running = False + logger.exception( + "Failure in event loop resulted in service termination" + ) + + if self._health_check_frequency >= 0: + hc_elapsed = time.time() - self._last_health_check + if hc_elapsed >= self._health_check_frequency: + self._on_health_check() + self._last_health_check = time.time() # allow immediate shutdown if not set to run as a service if not self._as_service: @@ -133,4 +179,7 @@ def execute(self) -> None: self._on_delay() time.sleep(self._loop_delay) - self._on_shutdown() + try: + self._on_shutdown() + except Exception: + logger.exception("Service shutdown may not have completed.") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 7526af14ad..5e01299141 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,8 @@ import collections import functools import itertools +import os +import socket import time import typing as t from dataclasses import dataclass, field @@ -34,18 +36,26 @@ from tabulate import tabulate -# pylint: disable=import-error +# pylint: disable=import-error,C0302,R0915 # isort: off -import dragon.data.ddict.ddict as dragon_ddict + import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.group_state as dragon_group_state + import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter +from smartsim._core.mli.infrastructure.control.listener import ( + ConsumerRegistrationListener, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict +from smartsim.error.errors import SmartSimError # pylint: enable=import-error # isort: on @@ -72,8 +82,8 @@ class DragonStatus(str, Enum): - ERROR = str(dragon_group_state.Error()) - RUNNING = str(dragon_group_state.Running()) + ERROR = "Error" + RUNNING = "Running" def __str__(self) -> str: return self.value @@ -90,7 +100,7 @@ class ProcessGroupInfo: return_codes: t.Optional[t.List[int]] = None """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) - """List of hosts on which the Process Group """ + """List of hosts on which the Process Group should be executed""" redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" @@ -147,6 +157,11 @@ class DragonBackend: by threads spawned by it. 
""" + _DEFAULT_NUM_MGR_PER_NODE = 2 + """The default number of manager processes for each feature store node""" + _DEFAULT_MEM_PER_NODE = 512 * 1024**2 + """The default memory capacity (in bytes) to allocate for a feaure store node""" + def __init__(self, pid: int) -> None: self._pid = pid """PID of dragon executable which launched this server""" @@ -180,14 +195,12 @@ def __init__(self, pid: int) -> None: """Whether the server frontend should shut down when the backend does""" self._shutdown_initiation_time: t.Optional[float] = None """The time at which the server initiated shutdown""" - smartsim_config = get_config() - self._cooldown_period = ( - smartsim_config.telemetry_frequency * 2 + 5 - if smartsim_config.telemetry_enabled - else 5 - ) - """Time in seconds needed to server to complete shutdown""" - self._infra_ddict: t.Optional[dragon_ddict.DDict] = None + self._cooldown_period = self._initialize_cooldown() + """Time in seconds needed by the server to complete shutdown""" + self._backbone: t.Optional[BackboneFeatureStore] = None + """The backbone feature store""" + self._listener: t.Optional[dragon_process.Process] = None + """The standalone process executing the event consumer""" self._nodes: t.List["dragon_machine.Node"] = [] """Node capability information for hosts in the allocation""" @@ -201,8 +214,6 @@ def __init__(self, pid: int) -> None: """Mapping with hostnames as keys and a set of running step IDs as the value""" self._initialize_hosts() - self._view = DragonBackendView(self) - logger.debug(self._view.host_desc) self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) @property @@ -254,12 +265,11 @@ def status_message(self) -> str: :returns: a status message """ - return ( - "Dragon server backend update\n" - f"{self._view.host_table}\n{self._view.step_table}" - ) + view = DragonBackendView(self) + return "Dragon server backend update\n" f"{view.host_table}\n{view.step_table}" def _heartbeat(self) -> None: + """Update the value of the last heartbeat to the current time.""" self._last_beat = self.current_time @property @@ -539,21 +549,83 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] - @property - def infra_ddict(self) -> str: - """Create a Dragon distributed dictionary and return its - serialized descriptor + def _create_backbone(self) -> BackboneFeatureStore: + """ + Creates a BackboneFeatureStore if one does not exist. Updates + environment variables of this process to include the backbone + descriptor. + + :returns: The backbone feature store + """ + if self._backbone is None: + backbone_storage = create_ddict( + len(self._hosts), + self._DEFAULT_NUM_MGR_PER_NODE, + self._DEFAULT_MEM_PER_NODE, + ) + + self._backbone = BackboneFeatureStore( + backbone_storage, allow_reserved_writes=True + ) + + # put the backbone descriptor in the env vars + os.environ.update(self._backbone.get_env()) + + return self._backbone + + @staticmethod + def _initialize_cooldown() -> int: + """Load environment configuration and determine the correct cooldown + period to apply to the backend process. + + :returns: The calculated cooldown (in seconds) + """ + smartsim_config = get_config() + return ( + smartsim_config.telemetry_frequency * 2 + 5 + if smartsim_config.telemetry_enabled + else 5 + ) + + def start_event_listener( + self, cpu_affinity: list[int], gpu_affinity: list[int] + ) -> dragon_process.Process: + """Start a standalone event listener. 
+ + :param cpu_affinity: The CPU affinity for the process + :param gpu_affinity: The GPU affinity for the process + :returns: The dragon Process managing the process + :raises SmartSimError: If the backbone is not provided """ - if self._infra_ddict is None: - logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict( - n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 - ) # todo: parametrize - logger.info("Created DDict") - self._infra_ddict["creation"] = str(time.time()) - logger.info(self._infra_ddict["creation"]) + if self._backbone is None: + raise SmartSimError("Backbone feature store is not available") - return str(self._infra_ddict.serialize()) + service = ConsumerRegistrationListener( + self._backbone, 1.0, 2.0, as_service=True, health_check_frequency=90 + ) + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + process = dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + env={ + **os.environ, + **self._backbone.get_env(), + }, + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + process.start() + return process @staticmethod def create_run_policy( @@ -595,7 +667,9 @@ def create_run_policy( ) def _start_steps(self) -> None: + """Start all new steps created since the last update.""" self._heartbeat() + with self._queue_lock: started = [] for step_id, request in self._queued_steps.items(): @@ -622,7 +696,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "_SMARTSIM_INFRA_BACKBONE": self.infra_ddict, + **(self._backbone.get_env() if self._backbone else {}), }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, @@ -758,6 +832,9 @@ def _refresh_statuses(self) -> None: group_info.redir_workers = None def _update_shutdown_status(self) -> None: + """Query the status of running tasks and update the status + of any that have completed. + """ self._heartbeat() with self._queue_lock: self._can_shutdown |= ( @@ -771,6 +848,9 @@ def _update_shutdown_status(self) -> None: ) def _should_print_status(self) -> bool: + """Determine if status messages should be printed based off the last + update. Returns `True` to trigger prints, `False` otherwise. 
+ """ if self.current_time - self._last_update_time > 10: self._last_update_time = self.current_time return True @@ -778,6 +858,8 @@ def _should_print_status(self) -> bool: def _update(self) -> None: """Trigger all update queries and update local state database""" + self._create_backbone() + self._stop_steps() self._start_steps() self._refresh_statuses() @@ -785,6 +867,9 @@ def _update(self) -> None: def _kill_all_running_jobs(self) -> None: with self._queue_lock: + if self._listener and self._listener.is_alive: + self._listener.kill() + for step_id, group_info in self._group_infos.items(): if group_info.status not in TERMINAL_STATUSES: self._stop_requests.append(DragonStopRequest(step_id=step_id)) @@ -872,6 +957,8 @@ def __init__(self, backend: DragonBackend) -> None: self._backend = backend """A dragon backend used to produce the view""" + logger.debug(self.host_desc) + @property def host_desc(self) -> str: hosts = self._backend.hosts diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 0cd68c24e9..1144b7764e 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -71,17 +71,23 @@ class DragonConnector: def __init__(self) -> None: self._context: zmq.Context[t.Any] = zmq.Context.instance() + """ZeroMQ context used to share configuration across requests""" self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None + """ZeroMQ authenticator used to secure queue access""" config = get_config() self._reset_timeout(config.dragon_server_timeout) self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None + """ZeroMQ socket exposing the connection to the DragonBackend""" self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None + """A handle to the process executing the DragonBackend""" # Returned by dragon head, useful if shutdown is to be requested # but process was started by another connector self._dragon_head_pid: t.Optional[int] = None + """Process ID of the process executing the DragonBackend""" self._dragon_server_path = config.dragon_server_path + """Path to a dragon installation""" logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") self._env_vars: t.Dict[str, str] = {} if self._dragon_server_path is None: @@ -95,7 +101,7 @@ def __init__(self) -> None: @property def is_connected(self) -> bool: - """Whether the Connector established a connection to the server + """Whether the Connector established a connection to the server. :return: True if connected """ @@ -104,12 +110,18 @@ def is_connected(self) -> bool: @property def can_monitor(self) -> bool: """Whether the Connector knows the PID of the dragon server head process - and can monitor its status + and can monitor its status. :return: True if the server can be monitored""" return self._dragon_head_pid is not None def _handshake(self, address: str) -> None: + """Perform the handshake process with the DragonBackend and + confirm two-way communication is established. + + :param address: The address of the head node socket to initiate a + handhake with + """ self._dragon_head_socket = dragonSockets.get_secure_socket( self._context, zmq.REQ, False ) @@ -132,6 +144,11 @@ def _handshake(self, address: str) -> None: ) from e def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None: + """Reset the timeout applied to the ZMQ context. 
If an authenticator is + enabled, also update the authenticator timeouts. + + :param timeout: The timeout value to apply to ZMQ sockets + """ self._context.setsockopt(zmq.SNDTIMEO, value=timeout) self._context.setsockopt(zmq.RCVTIMEO, value=timeout) if self._authenticator is not None and self._authenticator.thread is not None: @@ -183,11 +200,19 @@ def _get_new_authenticator( @staticmethod def _get_dragon_log_level() -> str: + """Maps the log level from SmartSim to a valid log level + for a dragon process. + + :returns: The dragon log level string + """ smartsim_to_dragon = defaultdict(lambda: "NONE") smartsim_to_dragon["developer"] = "INFO" return smartsim_to_dragon.get(get_config().log_level, "NONE") def _connect_to_existing_server(self, path: Path) -> None: + """Connects to an existing DragonBackend using address information from + a persisted dragon log file. + """ config = get_config() dragon_config_log = path / config.dragon_log_filename @@ -217,6 +242,11 @@ def _connect_to_existing_server(self, path: Path) -> None: return def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: + """Instantiate the ZMQ socket to be used by the connector. + + :param socket_addr: The socket address the connector should bind to + :returns: The bound socket + """ config = get_config() connector_socket: t.Optional[zmq.Socket[t.Any]] = None self._reset_timeout(config.dragon_server_startup_timeout) @@ -245,9 +275,14 @@ def load_persisted_env(self) -> t.Dict[str, str]: with open(config.dragon_dotenv, encoding="utf-8") as dot_env: for kvp in dot_env.readlines(): - split = kvp.strip().split("=", maxsplit=1) - key, value = split[0], split[-1] - self._env_vars[key] = value + if not kvp: + continue + + # skip any commented lines + if not kvp.startswith("#"): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value return self._env_vars @@ -418,6 +453,15 @@ def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse def _parse_launched_dragon_server_info_from_iterable( stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None ) -> t.List[t.Dict[str, str]]: + """Parses dragon backend connection information from a stream. + + :param stream: The stream to inspect. Usually the stdout of the + DragonBackend process + :param num_dragon_envs: The expected number of dragon environments + to parse from the stream. + :returns: A list of dictionaries, one per environment, containing + the parsed server information + """ lines = (line.strip() for line in stream) lines = (line for line in lines if line) tokenized = (line.split(maxsplit=1) for line in lines) @@ -444,6 +488,15 @@ def _parse_launched_dragon_server_info_from_files( file_paths: t.List[t.Union[str, "os.PathLike[str]"]], num_dragon_envs: t.Optional[int] = None, ) -> t.List[t.Dict[str, str]]: + """Read a known log file into a Stream and parse dragon server configuration + from the stream. + + :param file_paths: Path to a file containing dragon server configuration + :num_dragon_envs: The expected number of dragon environments to be found + in the file + :returns: The parsed server configuration, one item per + discovered dragon environment + """ with fileinput.FileInput(file_paths) as ifstream: dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( ifstream, num_dragon_envs @@ -458,6 +511,15 @@ def _send_req_with_socket( send_flags: int = 0, recv_flags: int = 0, ) -> DragonResponse: + """Sends a synchronous request through a ZMQ socket. 
+ + :param socket: Socket to send on + :param request: The request to send + :param send_flags: Configuration to apply to the send operation + :param recv_flags: Configuration to apply to the recv operation; used to + allow the receiver to immediately respond to the sent request. + :returns: The response from the target + """ client = dragonSockets.as_client(socket) with DRG_LOCK: logger.debug(f"Sending {type(request).__name__}: {request}") @@ -469,6 +531,13 @@ def _send_req_with_socket( def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + """Verify that objects can be sent as messages acceptable to the target. + + :param obj: The message to test + :param typ: The type that is acceptable + :returns: The original `obj` if it is of the requested type + :raises TypeError: If the object fails the test and is not + an instance of the desired type""" if not isinstance(obj, typ): raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") return obj @@ -520,6 +589,12 @@ def _dragon_cleanup( def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: + """Determine the applicable dragon server path for the connector + + :param fallback: A default dragon server path to use if one is not + found in the runtime configuration + :returns: The path to the dragon libraries + """ dragon_server_path = get_config().dragon_server_path or os.path.join( fallback, ".smartsim", "dragon" ) diff --git a/smartsim/_core/mli/client/__init__.py b/smartsim/_core/mli/client/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py new file mode 100644 index 0000000000..46598a8171 --- /dev/null +++ b/smartsim/_core/mli/client/protoclient.py @@ -0,0 +1,348 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
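Editor's sketch (not part of this patch): `mock_app.py` above is the reference consumer of this new client. A condensed, hypothetical usage example, assuming a Dragon runtime in which the backend has already published the backbone and worker-queue descriptors into the environment; the model key and file path are illustrative only:

import torch

from smartsim._core.mli.client.protoclient import ProtoClient

# Requires the backbone/worker-queue registration performed by the Dragon
# backend (see standalone_worker_manager.py earlier in this patch).
client = ProtoClient(timing_on=True)

with open("resnet50.pt", "rb") as fh:            # illustrative model file
    client.set_model("my-resnet", fh.read())      # also publishes an OnWriteFeatureStore event

batch = torch.randn((8, 3, 224, 224), dtype=torch.float32)
result = client.run_model("my-resnet", batch)     # str key is resolved via the backbone
client.perf_timer.print_timings(to_file=False)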
+ +# isort: off +# pylint: disable=unused-import,import-error +import dragon +import dragon.channels +from dragon.globalservices.api_setup import connect_to_infrastructure + +try: + from mpi4py import MPI # type: ignore[import-not-found] +except Exception: + MPI = None + print("Unable to import `mpi4py` package") + +# isort: on +# pylint: enable=unused-import,import-error + +import numbers +import os +import time +import typing as t +from collections import OrderedDict + +import numpy +import torch + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils.timings import PerfTimer +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +_TimingDict = OrderedDict[str, list[str]] + + +logger = get_logger("App") +logger.info("Started app") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + + +class ProtoClient: + """Proof of concept implementation of a client enabling user applications + to interact with MLI resources.""" + + _DEFAULT_BACKBONE_TIMEOUT = 1.0 + """A default timeout period applied to connection attempts with the + backbone feature store.""" + + _DEFAULT_WORK_QUEUE_SIZE = 500 + """A default number of events to be buffered in the work queue before + triggering QueueFull exceptions.""" + + _EVENT_SOURCE = "proto-client" + """A user-friendly name for this class instance to identify + the client as the publisher of an event.""" + + @staticmethod + def _attach_to_backbone() -> BackboneFeatureStore: + """Use the supplied environment variables to attach + to a pre-existing backbone featurestore. Requires the + environment to contain `_SMARTSIM_INFRA_BACKBONE` + environment variable. + + :returns: The attached backbone featurestore + :raises SmartSimError: If the backbone descriptor is not contained + in the appropriate environment variable + """ + descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + if descriptor is None or not descriptor: + raise SmartSimError( + "Missing required backbone configuration in environment: " + f"{BackboneFeatureStore.MLI_BACKBONE}" + ) + + backbone = t.cast( + BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor) + ) + return backbone + + def _attach_to_worker_queue(self) -> DragonFLIChannel: + """Wait until the backbone contains the worker queue configuration, + then attach an FLI to the given worker queue. + + :returns: The attached FLI channel + :raises SmartSimError: if the required configuration is not found in the + backbone feature store + """ + + descriptor = "" + try: + # NOTE: without wait_for, this MUST be in the backbone.... + config = self._backbone.wait_for( + [BackboneFeatureStore.MLI_WORKER_QUEUE], self.backbone_timeout + ) + descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE]) + except Exception as ex: + logger.info( + f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} " + "to attach to the worker queue." 
+ ) + raise SmartSimError("Unable to locate worker queue using backbone") from ex + + return DragonFLIChannel.from_descriptor(descriptor) + + def _create_broadcaster(self) -> EventBroadcaster: + """Create an EventBroadcaster that broadcasts events to + all MLI components registered to consume them. + + :returns: An EventBroadcaster instance + """ + broadcaster = EventBroadcaster( + self._backbone, DragonCommChannel.from_descriptor + ) + return broadcaster + + def __init__( + self, + timing_on: bool, + backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT, + ) -> None: + """Initialize the client instance. + + :param timing_on: Flag indicating if timing information should be + written to file + :param backbone_timeout: Maximum wait time (in seconds) allowed to attach to the + worker queue + :raises SmartSimError: If unable to attach to a backbone featurestore + :raises ValueError: If an invalid backbone timeout is specified + """ + if MPI is not None: + # TODO: determine a way to make MPI work in the test environment + # - consider catching the import exception and defaulting rank to 0 + comm = MPI.COMM_WORLD + rank: int = comm.Get_rank() + else: + rank = 0 + + if backbone_timeout <= 0: + raise ValueError( + f"Invalid backbone timeout provided: {backbone_timeout}. " + "The value must be greater than zero." + ) + self._backbone_timeout = max(backbone_timeout, 0.1) + + connect_to_infrastructure() + + self._backbone = self._attach_to_backbone() + self._backbone.wait_timeout = self.backbone_timeout + self._to_worker_fli = self._attach_to_worker_queue() + + self._from_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE) + self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE) + + self._publisher = self._create_broadcaster() + + self.perf_timer: PerfTimer = PerfTimer( + debug=False, timing_on=timing_on, prefix=f"a{rank}_" + ) + self._start: t.Optional[float] = None + self._interm: t.Optional[float] = None + self._timings: _TimingDict = OrderedDict() + self._timing_on = timing_on + + @property + def backbone_timeout(self) -> float: + """The timeout (in seconds) applied to retrievals + from the backbone feature store. + + :returns: A float indicating the number of seconds to allow""" + return self._backbone_timeout + + def _add_label_to_timings(self, label: str) -> None: + """Adds a new label into the timing dictionary to prepare for + receiving timing events. + + :param label: The label to create storage for + """ + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: t.Union[numbers.Number, float]) -> str: + """Utility function for formatting numbers consistently for logs. + + :param number: The number to convert to a formatted string + :returns: The formatted string containing the number + """ + return f"{number:0.4e}" + + def start_timings(self, batch_size: numbers.Number) -> None: + """Configure the client to begin storing timing information. 
+ + :param batch_size: The size of batches to generate as inputs + to the model + """ + if self._timing_on: + self._add_label_to_timings("batch_size") + self._timings["batch_size"].append(self._format_number(batch_size)) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self) -> None: + """Configure the client to stop storing timing information.""" + if self._timing_on and self._start is not None: + self._add_label_to_timings("total_time") + self._timings["total_time"].append( + self._format_number(time.perf_counter() - self._start) + ) + + def measure_time(self, label: str) -> None: + """Measures elapsed time since the last recorded signal. + + :param label: The label to measure time for + """ + if self._timing_on and self._interm is not None: + self._add_label_to_timings(label) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False) -> None: + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. + + :param to_file: If `True`, also saves timing information + to the files `timings.npy` and `timings.txt` + """ + print(" ".join(self._timings.keys())) + + value_array = numpy.array(self._timings.values(), dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: + """Execute a batch of inference requests with the supplied ML model. + + :param model: The raw bytes or path to a pytorch model + :param batch: The tensor batch to perform inference on + :returns: The inference results + :raises ValueError: if the worker queue is not configured properly + in the environment variables + """ + tensors = [batch.numpy()] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone.descriptor) + else: + model_arg = MessageHandler.build_model( + model, "resnet-50", "1.0" + ) # type: ignore + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch.descriptor, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + + if self._to_worker_fli is None: + raise ValueError("No worker queue available.") + + # pylint: disable-next=protected-access + with self._to_worker_fli._channel.sendh( # type: ignore + timeout=None, + stream_channel=self._to_worker_ch.channel, + ) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! 
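# Editor's note (not part of this patch): the request/tensor framing above
# streams every payload through a single send handle, which is what the new
# DragonFLIChannel.send_multiple helper (added later in this diff) wraps.
# A hedged alternative, trading the per-message timings and the explicit
# stream channel for brevity:
#
#     payloads = [request_bytes, *(tensor.tobytes() for tensor in tensors)]
#     self._to_worker_fli.send_multiple(payloads)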
+ logger.info(f"Message size: {len(request_bytes)} bytes") + + self.perf_timer.measure_time("send_tensors") + with self._from_worker_ch.channel.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp) + self.perf_timer.measure_time("deserialize_response") + + # recv depending on the len(response.result.descriptors)? + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + ) + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + return result + + def set_model(self, key: str, model: bytes) -> None: + """Write the supplied model to the feature store. + + :param key: The unique key used to identify the model + :param model: The raw bytes of the model to execute + """ + self._backbone[key] = model + + # notify components of a change in the data at this key + event = OnWriteFeatureStore(self._EVENT_SOURCE, self._backbone.descriptor, key) + self._publisher.send(event) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 9a12e4c8dc..104333ce7f 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -26,6 +26,7 @@ import base64 import typing as t +import uuid from abc import ABC, abstractmethod from smartsim.log import get_logger @@ -36,24 +37,31 @@ class CommChannelBase(ABC): """Base class for abstracting a message passing mechanism""" - def __init__(self, descriptor: t.Union[str, bytes]) -> None: + def __init__( + self, + descriptor: str, + name: t.Optional[str] = None, + ) -> None: """Initialize the CommChannel instance. :param descriptor: Channel descriptor """ self._descriptor = descriptor + """An opaque identifier used to connect to an underlying communication channel""" + self._name = name or str(uuid.uuid4()) + """A user-friendly identifier for channel-related logging""" @abstractmethod - def send(self, value: bytes, timeout: float = 0) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. - :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send :raises SmartSimError: If sending message fails """ @abstractmethod - def recv(self, timeout: float = 0) -> t.List[bytes]: + def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to arrive @@ -61,11 +69,14 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: """ @property - def descriptor(self) -> bytes: + def descriptor(self) -> str: """Return the channel descriptor for the underlying dragon channel. 
:returns: Byte encoded channel descriptor """ - if isinstance(self._descriptor, str): - return base64.b64decode(self._descriptor.encode("utf-8")) return self._descriptor + + def __str__(self) -> str: + """Build a string representation of the channel useful for printing.""" + classname = type(self).__class__.__name__ + return f"{classname}('{self._name}', '{self._descriptor}')" diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 1363c0d675..110f19258a 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -24,65 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 -import sys import typing as t import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du import smartsim._core.mli.comm.channel.channel as cch +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) -import dragon.channels as dch - -DEFAULT_CHANNEL_BUFFER_SIZE = 500 -"""Maximum number of messages that can be buffered. DragonCommChannel will -raise an exception if no clients consume messages before the buffer is filled.""" - - -def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool. - - :param capacity: The number of events the channel can buffer; uses the default - buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied - :returns: The instantiated channel - :raises SmartSimError: If unable to attach local channel - """ - pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) - channel: t.Optional[dch.Channel] = None - offset = 0 - - capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE - - while not channel: - # search for an open channel ID - offset += 1 - cid = df.BASE_USER_MANAGED_CUID + offset - try: - channel = dch.Channel( - mem_pool=pool, - c_uid=cid, - capacity=capacity, - ) - logger.debug( - f"Channel {cid} created in pool {pool.serialize()} w/capacity {capacity}" - ) - except Exception as e: - if offset < 100: - logger.warning(f"Unable to attach to channel id {cid}. 
Retrying...") - else: - logger.error(f"All attempts to attach local channel have failed") - raise SmartSimError("Failed to attach local channel") from e - - return channel - class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel.""" @@ -92,10 +44,10 @@ def __init__(self, channel: "dch.Channel") -> None: :param channel: A channel to use for communications """ - serialized_ch = channel.serialize() - descriptor = base64.b64encode(serialized_ch).decode("utf-8") + descriptor = drg_util.channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel + """The underlying dragon channel used by this CommChannel for communications""" @property def channel(self) -> "dch.Channel": @@ -114,11 +66,11 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: """ try: with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value) - logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") + sendh.send_bytes(value, blocking=False) + logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( - f"Error sending message: DragonCommChannel {self.descriptor!r}" + f"Error sending via DragonCommChannel {self.descriptor}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: @@ -133,56 +85,43 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: try: message_bytes = recvh.recv_bytes(timeout=timeout) messages.append(message_bytes) - logger.debug(f"DragonCommChannel {self.descriptor!r} received message") + logger.debug(f"DragonCommChannel {self.descriptor} received message") except dch.ChannelEmpty: # emptied the queue, ok to swallow this ex - logger.debug(f"DragonCommChannel exhausted: {self.descriptor!r}") - except dch.ChannelRecvTimeout as ex: - logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor!r}") + logger.debug(f"DragonCommChannel exhausted: {self.descriptor}") + except dch.ChannelRecvTimeout: + logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}") return messages - @property - def descriptor_string(self) -> str: - """Return the channel descriptor for the underlying dragon channel - as a string. Automatically performs base64 encoding to ensure the - string can be used in a call to `from_descriptor`. - - :returns: String representation of channel descriptor - :raises ValueError: If unable to convert descriptor to a string - """ - if isinstance(self._descriptor, str): - return self._descriptor - - if isinstance(self._descriptor, bytes): - return base64.b64encode(self._descriptor).decode("utf-8") - - raise ValueError(f"Unable to convert channel descriptor: {self._descriptor}") - @classmethod def from_descriptor( cls, - descriptor: t.Union[bytes, str], + descriptor: str, ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string. - :param descriptor: The descriptor that uniquely identifies the resource. Output - from `descriptor_string` is correctly encoded. + :param descriptor: The descriptor that uniquely identifies the resource. 
:returns: An attached DragonCommChannel :raises SmartSimError: If creation of comm channel fails """ try: - utf8_descriptor: t.Union[str, bytes] = descriptor - if isinstance(descriptor, str): - utf8_descriptor = descriptor.encode("utf-8") - - # todo: ensure the bytes argument and condition are removed - # after refactoring the RPC models - - actual_descriptor = base64.b64decode(utf8_descriptor) - channel = dch.Channel.attach(actual_descriptor) + channel = drg_util.descriptor_to_channel(descriptor) return DragonCommChannel(channel) except Exception as ex: raise SmartSimError( - f"Failed to create dragon comm channel: {descriptor!r}" + f"Failed to create dragon comm channel: {descriptor}" ) from ex + + @classmethod + def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel": + """A factory method that creates a local channel instance. + + :param _descriptor: Unused placeholder + :returns: An attached DragonCommChannel""" + try: + channel = drg_util.create_local() + return DragonCommChannel(channel) + except: + logger.error(f"Failed to create local dragon comm channel", exc_info=True) + raise diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 84d809c8ac..5fb0790a84 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -26,19 +26,14 @@ # isort: off from dragon import fli -import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du +from dragon.channels import Channel # isort: on -import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch -from smartsim._core.mli.comm.channel.dragon_channel import create_local +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -50,36 +45,70 @@ class DragonFLIChannel(cch.CommChannelBase): def __init__( self, - fli_desc: bytes, - sender_supplied: bool = True, - buffer_size: int = 0, + fli_: fli.FLInterface, + buffer_size: int = drg_util.DEFAULT_CHANNEL_BUFFER_SIZE, ) -> None: """Initialize the DragonFLIChannel instance. - :param fli_desc: The descriptor of the FLI channel to attach + :param fli_: The FLIInterface to use as the underlying communications channel :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams :param buffer_size: Maximum number of sent messages that can be buffered """ - super().__init__(fli_desc) - self._fli: "fli" = fli.FLInterface.attach(fli_desc) - self._channel: t.Optional["dch"] = ( - create_local(buffer_size) if sender_supplied else None - ) + descriptor = drg_util.channel_to_descriptor(fli_) + super().__init__(descriptor) + + self._channel: t.Optional["Channel"] = None + """The underlying dragon Channel used by a sender-side DragonFLIChannel + to attach to the main FLI channel""" + + self._fli = fli_ + """The underlying dragon FLInterface used by this CommChannel for communications""" + self._buffer_size: int = buffer_size + """Maximum number of messages that can be buffered before sending""" def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. 
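Editor's sketch (not part of this patch): the `from_local`/`from_descriptor` pair above implies a descriptor round trip that other MLI components rely on. A minimal example, assuming a running Dragon environment:

from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel

local = DragonCommChannel.from_local()            # new channel in the local pool
descriptor = local.descriptor                     # base64 string, env-var friendly
attached = DragonCommChannel.from_descriptor(descriptor)

attached.send(b"ping", timeout=0.1)               # write via the attached handle
assert local.recv(timeout=0.1) == [b"ping"]       # read via the original handle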
- :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send :raises SmartSimError: If sending message fails """ try: + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) - logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") + except Exception as e: + self._channel = None + raise SmartSimError( + f"Error sending via DragonFLIChannel {self.descriptor}" + ) from e + + def send_multiple( + self, + values: t.Sequence[bytes], + timeout: float = 0.001, + ) -> None: + """Send a message through the underlying communication channel. + + :param values: The values to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :raises SmartSimError: If sending message fails + """ + try: + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) + + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + for value in values: + sendh.send_bytes(value) + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: + self._channel = None raise SmartSimError( - f"Error sending message: DragonFLIChannel {self.descriptor!r}" + f"Error sending via DragonFLIChannel {self.descriptor} {e}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: @@ -96,14 +125,13 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: try: message, _ = recvh.recv_bytes(timeout=timeout) messages.append(message) - logger.debug( - f"DragonFLIChannel {self.descriptor!r} received message" - ) + logger.debug(f"DragonFLIChannel {self.descriptor} received message") except fli.FLIEOT: eot = True + logger.debug(f"DragonFLIChannel exhausted: {self.descriptor}") except Exception as e: raise SmartSimError( - f"Error receiving messages: DragonFLIChannel {self.descriptor!r}" + f"Error receiving messages: DragonFLIChannel {self.descriptor}" ) from e return messages @@ -116,13 +144,14 @@ def from_descriptor( :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached DragonFLIChannel - :raises SmartSimError: If creation of DragonFLIChanenel fails + :raises SmartSimError: If creation of DragonFLIChannel fails + :raises ValueError: If the descriptor is invalid """ + if not descriptor: + raise ValueError("Invalid descriptor provided") + try: - return DragonFLIChannel( - fli_desc=base64.b64decode(descriptor), - sender_supplied=True, - ) + return DragonFLIChannel(fli_=drg_util.descriptor_to_fli(descriptor)) except Exception as e: raise SmartSimError( f"Error while creating DragonFLIChannel: {descriptor}" diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py new file mode 100644 index 0000000000..8517979ec4 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragon_util.py @@ -0,0 +1,131 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import binascii +import typing as t + +import dragon.channels as dch +import dragon.fli as fli +import dragon.managed_memory as dm + +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + +DEFAULT_CHANNEL_BUFFER_SIZE = 500 +"""Maximum number of messages that can be buffered. DragonCommChannel will +raise an exception if no clients consume messages before the buffer is filled.""" + +LAST_OFFSET = 0 +"""The last offset used to create a local channel. This is used to avoid +unnecessary retries when creating a local channel.""" + + +def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str: + """Convert a dragon channel to a descriptor string. + + :param channel: The dragon channel to convert + :returns: The descriptor string + :raises ValueError: If a dragon channel is not provided + """ + if channel is None: + raise ValueError("Channel is not available to create a descriptor") + + serialized_ch = channel.serialize() + return base64.b64encode(serialized_ch).decode("utf-8") + + +def pool_to_descriptor(pool: dm.MemoryPool) -> str: + """Convert a dragon memory pool to a descriptor string. + + :param pool: The memory pool to convert + :returns: The descriptor string + :raises ValueError: If a memory pool is not provided + """ + if pool is None: + raise ValueError("Memory pool is not available to create a descriptor") + + serialized_pool = pool.serialize() + return base64.b64encode(serialized_pool).decode("utf-8") + + +def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": + """Create and attach a new FLI instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of an FLI to attach to + :returns: The attached dragon FLI + :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If attachment using the descriptor fails + """ + if len(descriptor) < 1: + raise ValueError("Descriptors may not be empty") + + try: + encoded = descriptor.encode("utf-8") + descriptor_ = base64.b64decode(encoded) + return fli.FLInterface.attach(descriptor_) + except binascii.Error: + raise ValueError("The descriptor was not properly base64 encoded") + except fli.DragonFLIError: + raise SmartSimError("The descriptor did not address an available FLI") + + +def descriptor_to_channel(descriptor: str) -> dch.Channel: + """Create and attach a new Channel instance given + the string-encoded descriptor. 
+ + :param descriptor: The descriptor of a channel to attach to + :returns: The attached dragon Channel + :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If attachment using the descriptor fails + """ + if len(descriptor) < 1: + raise ValueError("Descriptors may not be empty") + + try: + encoded = descriptor.encode("utf-8") + descriptor_ = base64.b64decode(encoded) + return dch.Channel.attach(descriptor_) + except binascii.Error: + raise ValueError("The descriptor was not properly base64 encoded") + except dch.ChannelError: + raise SmartSimError("The descriptor did not address an available channel") + + +def create_local(_capacity: int = 0) -> dch.Channel: + """Creates a Channel attached to the local memory pool. Replacement for + direct calls to `dch.Channel.make_process_local()` to enable + supplying a channel capacity. + + :param _capacity: The number of events the channel can buffer; uses the default + buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied + :returns: The instantiated channel + """ + channel = dch.Channel.make_process_local() + return channel diff --git a/smartsim/_core/mli/infrastructure/comm/__init__.py b/smartsim/_core/mli/infrastructure/comm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py new file mode 100644 index 0000000000..56dcf549f7 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py @@ -0,0 +1,239 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
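# ----------------------------------------------------------------------------
# Illustrative sketch only (not part of this patch): how the new dragon_util
# helpers and DragonCommChannel are expected to compose. Assumes a running
# Dragon runtime; module paths match the files added/updated above.
import smartsim._core.mli.comm.channel.dragon_util as drg_util
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel

# create a process-local channel and encode it as a portable descriptor string
channel = drg_util.create_local()
descriptor = drg_util.channel_to_descriptor(channel)  # base64 of serialize()

# the descriptor can be shared (env var, backbone, etc.) and re-attached later
attached = drg_util.descriptor_to_channel(descriptor)
comm = DragonCommChannel.from_descriptor(descriptor)
comm.send(b"hello", timeout=0.1)
# ----------------------------------------------------------------------------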
+ +import typing as t +import uuid +from collections import defaultdict, deque + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.infrastructure.comm.event import EventBase +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class BroadcastResult(t.NamedTuple): + """Contains summary details about a broadcast.""" + + num_sent: int + """The total number of messages delivered across all consumers""" + num_failed: int + """The total number of messages not delivered across all consumers""" + + +class EventBroadcaster: + """Performs fan-out publishing of system events.""" + + def __init__( + self, + backbone: BackboneFeatureStore, + channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, + name: t.Optional[str] = None, + ) -> None: + """Initialize the EventPublisher instance. + + :param backbone: The MLI backbone feature store + :param channel_factory: Factory method to construct new channel instances + :param name: A user-friendly name for logging. If not provided, an + auto-generated GUID will be used + """ + self._backbone = backbone + """The backbone feature store used to retrieve consumer descriptors""" + self._channel_factory = channel_factory + """A factory method used to instantiate channels from descriptors""" + self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict( + lambda: None + ) + """A mapping of instantiated channels that can be re-used. Automatically + calls the channel factory if a descriptor is not already in the collection""" + self._event_buffer: t.Deque[EventBase] = deque() + """A buffer for storing events when a consumer list is not found""" + self._descriptors: t.Set[str] + """Stores the most recent list of broadcast consumers. Updated automatically + on each broadcast""" + self._name = name or str(uuid.uuid4()) + """A unique identifer assigned to the broadcaster for logging""" + + @property + def name(self) -> str: + """The friendly name assigned to the broadcaster. + + :returns: The broadcaster name if one is assigned, otherwise a unique + id assigned by the system. + """ + return self._name + + @property + def num_buffered(self) -> int: + """Return the number of events currently buffered to send. + + :returns: Number of buffered events + """ + return len(self._event_buffer) + + def _save_to_buffer(self, event: EventBase) -> None: + """Places the event in the buffer to be sent once a consumer + list is available. 
+ + :param event: The event to buffer + :raises ValueError: If the event cannot be buffered + """ + try: + self._event_buffer.append(event) + logger.debug(f"Buffered event {event=}") + except Exception as ex: + raise ValueError( + f"Unable to buffer event {event} in broadcaster {self.name}" + ) from ex + + def _log_broadcast_start(self) -> None: + """Logs broadcast statistics.""" + num_events = len(self._event_buffer) + num_copies = len(self._descriptors) + logger.debug( + f"Broadcast {num_events} events to {num_copies} consumers from {self.name}" + ) + + def _prune_unused_consumers(self) -> None: + """Performs maintenance on the channel cache by pruning any channel + that has been removed from the consumers list.""" + active_consumers = set(self._descriptors) + current_channels = set(self._channel_cache.keys()) + + # find any cached channels that are now unused + inactive_channels = current_channels.difference(active_consumers) + new_channels = active_consumers.difference(current_channels) + + for descriptor in inactive_channels: + self._channel_cache.pop(descriptor) + + logger.debug( + f"Pruning {len(inactive_channels)} stale consumers and" + f" found {len(new_channels)} new channels for {self.name}" + ) + + def _get_comm_channel(self, descriptor: str) -> CommChannelBase: + """Helper method to build and cache a comm channel. + + :param descriptor: The descriptor to pass to the channel factory + :returns: The instantiated channel + :raises SmartSimError: If the channel fails to attach + """ + comm_channel = self._channel_cache[descriptor] + if comm_channel is not None: + return comm_channel + + if self._channel_factory is None: + raise SmartSimError("No channel factory provided for consumers") + + try: + channel = self._channel_factory(descriptor) + self._channel_cache[descriptor] = channel + return channel + except Exception as ex: + msg = f"Unable to construct channel with descriptor: {descriptor}" + logger.error(msg, exc_info=True) + raise SmartSimError(msg) from ex + + def _get_next_event(self) -> t.Optional[EventBase]: + """Pop the next event to be sent from the queue. + + :returns: The next event to send if any events are enqueued, otherwise `None`. + """ + try: + return self._event_buffer.popleft() + except IndexError: + logger.debug(f"Broadcast buffer exhausted for {self.name}") + + return None + + def _broadcast(self, timeout: float = 0.001) -> BroadcastResult: + """Broadcasts all buffered events to registered event consumers. 
+ + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: BroadcastResult containing the number of messages that were + successfully and unsuccessfully sent for all consumers + :raises SmartSimError: If the channel fails to attach or broadcasting fails + """ + # allow descriptors to be empty since events are buffered + self._descriptors = set(x for x in self._backbone.notification_channels if x) + if not self._descriptors: + msg = f"No event consumers are registered for {self.name}" + logger.warning(msg) + return BroadcastResult(0, 0) + + self._prune_unused_consumers() + self._log_broadcast_start() + + num_listeners = len(self._descriptors) + num_sent = 0 + num_failures = 0 + + # send each event to every consumer + while event := self._get_next_event(): + logger.debug(f"Broadcasting {event=} to {num_listeners} listeners") + event_bytes = bytes(event) + + for i, descriptor in enumerate(self._descriptors): + comm_channel = self._get_comm_channel(descriptor) + + try: + comm_channel.send(event_bytes, timeout) + num_sent += 1 + except Exception: + msg = ( + f"Broadcast {i+1}/{num_listeners} for event {event.uid} to " + f"channel {descriptor} from {self.name} failed." + ) + logger.exception(msg) + num_failures += 1 + + return BroadcastResult(num_sent, num_failures) + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """Implementation of `send` method of the `EventPublisher` protocol. Publishes + the supplied event to all registered broadcast consumers. + + :param event: An event to publish + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The total number of events successfully published to consumers + :raises ValueError: If event serialization fails + :raises AttributeError: If event cannot be serialized + :raises KeyError: If channel fails to attach using registered descriptors + :raises SmartSimError: If any unexpected error occurs during send + """ + try: + self._save_to_buffer(event) + result = self._broadcast(timeout) + return result.num_sent + except (KeyError, ValueError, AttributeError, SmartSimError): + raise + except Exception as ex: + raise SmartSimError("An unexpected failure occurred while sending") from ex diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py new file mode 100644 index 0000000000..08b5c47852 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/consumer.py @@ -0,0 +1,281 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pickle +import time +import typing as t +import uuid + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.infrastructure.comm.event import ( + EventBase, + OnCreateConsumer, + OnRemoveConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class EventConsumer: + """Reads system events published to a communications channel.""" + + _BACKBONE_WAIT_TIMEOUT = 10.0 + """Maximum time (in seconds) to wait for the backbone to register the consumer""" + + def __init__( + self, + comm_channel: CommChannelBase, + backbone: BackboneFeatureStore, + filters: t.Optional[t.List[str]] = None, + name: t.Optional[str] = None, + event_handler: t.Optional[t.Callable[[EventBase], None]] = None, + ) -> None: + """Initialize the EventConsumer instance. + + :param comm_channel: Communications channel to listen to for events + :param backbone: The MLI backbone feature store + :param filters: A list of event types to deliver. when empty, all + events will be delivered + :param name: A user-friendly name for logging. If not provided, an + auto-generated GUID will be used + """ + self._comm_channel = comm_channel + """The comm channel used by the consumer to receive messages. The channel + descriptor will be published for senders to discover.""" + self._backbone = backbone + """The backbone instance used to bootstrap the instance. The EventConsumer + uses the backbone to discover where it can publish its descriptor.""" + self._global_filters = filters or [] + """A set of global filters to apply to incoming events. Global filters are + combined with per-call filters. Filters act as an allow-list.""" + self._name = name or str(uuid.uuid4()) + """User-friendly name assigned to a consumer for logging. Automatically + assigned if not provided.""" + self._event_handler = event_handler + """The function that should be executed when an event + passed by the filters is received.""" + self.listening = True + """Flag indicating that the consumer is currently listening for new + events. Setting this flag to `False` will cause any active calls to + `listen` to terminate.""" + + @property + def descriptor(self) -> str: + """The descriptor of the underlying comm channel. + + :returns: The comm channel descriptor""" + return self._comm_channel.descriptor + + @property + def name(self) -> str: + """The friendly name assigned to the consumer. + + :returns: The consumer name if one is assigned, otherwise a unique + id assigned by the system. + """ + return self._name + + def recv( + self, + filters: t.Optional[t.List[str]] = None, + timeout: float = 0.001, + batch_timeout: float = 1.0, + ) -> t.List[EventBase]: + """Receives available published event(s). 
+ + :param filters: Additional filters to add to the global filters configured + on the EventConsumer instance + :param timeout: Maximum time to wait for a single message to arrive + :param batch_timeout: Maximum time to wait for messages to arrive; allows + multiple batches to be retrieved in one call to `send` + :returns: A list of events that pass any configured filters + :raises ValueError: If a positive, non-zero value is not provided for the + timeout or batch_timeout. + """ + if filters is None: + filters = [] + + if timeout is not None and timeout <= 0: + raise ValueError("request timeout must be a non-zero, positive value") + + if batch_timeout is not None and batch_timeout <= 0: + raise ValueError("batch_timeout must be a non-zero, positive value") + + filter_set = {*self._global_filters, *filters} + all_message_bytes: t.List[bytes] = [] + + # firehose as many messages as possible within the batch_timeout + start_at = time.time() + remaining = batch_timeout + + batch_message_bytes = self._comm_channel.recv(timeout=timeout) + while batch_message_bytes: + # remove any empty messages that will fail to decode + all_message_bytes.extend(batch_message_bytes) + batch_message_bytes = [] + + # avoid getting stuck indefinitely waiting for the channel + elapsed = time.time() - start_at + remaining = batch_timeout - elapsed + + if remaining > 0: + batch_message_bytes = self._comm_channel.recv(timeout=timeout) + + events_received: t.List[EventBase] = [] + + # Timeout elapsed or no messages received - return the empty list + if not all_message_bytes: + return events_received + + for message in all_message_bytes: + if not message or message is None: + continue + + event = pickle.loads(message) + if not event: + logger.warning(f"Consumer {self.name} is unable to unpickle message") + continue + + # skip events that don't pass a filter + if filter_set and event.category not in filter_set: + continue + + events_received.append(event) + + return events_received + + def _send_to_registrar(self, event: EventBase) -> None: + """Send an event direct to the registrar listener.""" + registrar_key = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER + config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT) + registrar_descriptor = str(config.get(registrar_key, None)) + + if not registrar_descriptor: + logger.warning( + f"Unable to send {event.category} from {self.name}. " + "No registrar channel found." + ) + return + + logger.debug(f"Sending {event.category} from {self.name}") + + registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) + registrar_channel.send(bytes(event), timeout=1.0) + + logger.debug(f"{event.category} from {self.name} sent") + + def register(self) -> None: + """Send an event to register this consumer as a listener.""" + descriptor = self._comm_channel.descriptor + event = OnCreateConsumer(self.name, descriptor, self._global_filters) + + self._send_to_registrar(event) + + def unregister(self) -> None: + """Send an event to un-register this consumer as a listener.""" + descriptor = self._comm_channel.descriptor + event = OnRemoveConsumer(self.name, descriptor) + + self._send_to_registrar(event) + + def _on_handler_missing(self, event: EventBase) -> None: + """A "dead letter" event handler that is called to perform + processing on events before they're discarded. + + :param event: The event to handle + """ + logger.warning( + "No event handler is registered in consumer " + f"{self.name}. 
Discarding {event=}"
+        )
+
+    def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer a single time. Delivers
+        all messages that pass the consumer filters. Shutdown requests
+        are handled by a default event handler.
+
+        NOTE: Executes a single batch-retrieval to receive the maximum
+        number of messages available under batch timeout. To continually
+        listen, use `listen` in a non-blocking thread/process.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        logger.info(
+            f"Consumer {self.name} listening with {timeout} second timeout"
+            f" on channel {self._comm_channel.descriptor}"
+        )
+
+        if not self._event_handler:
+            logger.info("Unable to handle messages. No event handler is registered.")
+
+        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+        if not incoming_messages:
+            logger.info(f"Consumer {self.name} received empty message list")
+
+        for message in incoming_messages:
+            logger.info(f"Consumer {self.name} is handling event {message=}")
+            self._handle_shutdown(message)
+
+            if self._event_handler:
+                self._event_handler(message)
+            else:
+                self._on_handler_missing(message)
+
+    def _handle_shutdown(self, event: EventBase) -> bool:
+        """Handles shutdown requests sent to the consumer by setting the
+        `self.listening` flag to `False`.
+
+        :param event: The event to handle
+        :returns: A bool indicating if the event was a shutdown request
+        """
+        if isinstance(event, OnShutdownRequested):
+            logger.debug(f"Shutdown requested from: {event.source}")
+            self.listening = False
+            return True
+        return False
+
+    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer until a shutdown request is received.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+
+        logger.debug(f"Consumer {self.name} is now listening for events.")
+
+        while self.listening:
+            self.listen_once(timeout, batch_timeout)
+
+        logger.debug(f"Consumer {self.name} is no longer listening.")
diff --git a/smartsim/_core/mli/infrastructure/comm/event.py b/smartsim/_core/mli/infrastructure/comm/event.py
new file mode 100644
index 0000000000..ccef9f9b86
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/event.py
@@ -0,0 +1,162 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pickle +import typing as t +import uuid +from dataclasses import dataclass, field + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +@dataclass +class EventBase: + """Core API for an event.""" + + category: str + """Unique category name for an event class""" + source: str + """A unique identifier for the publisher of the event""" + uid: str = field(default_factory=lambda: str(uuid.uuid4())) + """A unique identifier for this event""" + + def __bytes__(self) -> bytes: + """Default conversion to bytes for an event required to publish + messages using byte-oriented communication channels. + + :returns: This entity encoded as bytes""" + return pickle.dumps(self) + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance""" + return f"{self.uid}|{self.category}" + + +class OnShutdownRequested(EventBase): + """Publish this event to trigger the listener to shutdown.""" + + SHUTDOWN: t.ClassVar[str] = "consumer-unregister" + """Unique category name for an event raised when a new consumer is unregistered""" + + def __init__(self, source: str) -> None: + """Initialize the event instance. + + :param source: A unique identifier for the publisher of the event + creating the event + """ + super().__init__(self.SHUTDOWN, source) + + +class OnCreateConsumer(EventBase): + """Publish this event when a new event consumer registration is required.""" + + descriptor: str + """Descriptor of the comm channel exposed by the consumer""" + filters: t.List[str] = field(default_factory=list) + """The collection of filters indicating messages of interest to this consumer""" + + CONSUMER_CREATED: t.ClassVar[str] = "consumer-created" + """Unique category name for an event raised when a new consumer is registered""" + + def __init__(self, source: str, descriptor: str, filters: t.Sequence[str]) -> None: + """Initialize the event instance. + + :param source: A unique identifier for the publisher of the event + :param descriptor: Descriptor of the comm channel exposed by the consumer + :param filters: Collection of filters indicating messages of interest + """ + super().__init__(self.CONSUMER_CREATED, source) + self.descriptor = descriptor + self.filters = list(filters) + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + _filters = ",".join(self.filters) + return f"{str(super())}|{self.descriptor}|{_filters}" + + +class OnRemoveConsumer(EventBase): + """Publish this event when a consumer is shutting down and + should be removed from notification lists.""" + + descriptor: str + """Descriptor of the comm channel exposed by the consumer""" + + CONSUMER_REMOVED: t.ClassVar[str] = "consumer-removed" + """Unique category name for an event raised when a new consumer is unregistered""" + + def __init__(self, source: str, descriptor: str) -> None: + """Initialize the OnRemoveConsumer event. 
+ + :param source: A unique identifier for the publisher of the event + :param descriptor: Descriptor of the comm channel exposed by the consumer + """ + super().__init__(self.CONSUMER_REMOVED, source) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}" + + +class OnWriteFeatureStore(EventBase): + """Publish this event when a feature store key is written.""" + + descriptor: str + """The descriptor of the feature store where the write occurred""" + key: str + """The key identifying where the write occurred""" + + FEATURE_STORE_WRITTEN: str = "feature-store-written" + """Event category for an event raised when a feature store key is written""" + + def __init__(self, source: str, descriptor: str, key: str) -> None: + """Initialize the OnWriteFeatureStore event. + + :param source: A unique identifier for the publisher of the event + :param descriptor: The descriptor of the feature store where the write occurred + :param key: The key identifying where the write occurred + """ + super().__init__(self.FEATURE_STORE_WRITTEN, source) + self.descriptor = descriptor + self.key = key + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}|{self.key}" diff --git a/smartsim/_core/mli/infrastructure/comm/producer.py b/smartsim/_core/mli/infrastructure/comm/producer.py new file mode 100644 index 0000000000..2d8a7c14ad --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/producer.py @@ -0,0 +1,44 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from smartsim._core.mli.infrastructure.comm.event import EventBase +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class EventProducer(t.Protocol): + """Core API of a class that publishes events.""" + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """Send an event using the configured comm channel. 
+ + :param event: The event to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of messages that were sent + """ diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py index 8961cac543..a75f533a37 100644 --- a/smartsim/_core/mli/infrastructure/control/error_handling.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -48,7 +48,7 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: return MessageHandler.build_response( status=status, message=message, - result=[], + result=None, custom_attributes=None, ) diff --git a/smartsim/_core/mli/infrastructure/control/listener.py b/smartsim/_core/mli/infrastructure/control/listener.py new file mode 100644 index 0000000000..56a7b12d34 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/listener.py @@ -0,0 +1,352 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +# pylint: disable=import-error +# pylint: disable=unused-import +import socket +import dragon + +# pylint: enable=unused-import +# pylint: enable=import-error +# isort: on + +import argparse +import multiprocessing as mp +import os +import sys +import typing as t + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( + EventBase, + OnCreateConsumer, + OnRemoveConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class ConsumerRegistrationListener(Service): + """A long-running service that manages the list of consumers receiving + events that are broadcast. 
It hosts handlers for adding and removing consumers + """ + + def __init__( + self, + backbone: BackboneFeatureStore, + timeout: float, + batch_timeout: float, + as_service: bool = False, + cooldown: int = 0, + health_check_frequency: float = 60.0, + ) -> None: + """Initialize the EventListener. + + :param backbone: The backbone feature store + :param timeout: Maximum time (in seconds) to allow a single recv request to wait + :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to + continue to build + :param as_service: Specifies run-once or run-until-complete behavior of service + :param cooldown: Number of seconds to wait before shutting down after + shutdown criteria are met + """ + super().__init__( + as_service, cooldown, health_check_frequency=health_check_frequency + ) + self._timeout = timeout + """ Maximum time (in seconds) to allow a single recv request to wait""" + self._batch_timeout = batch_timeout + """Maximum time (in seconds) to allow a batch of receives to + continue to build""" + self._consumer: t.Optional[EventConsumer] = None + """The event consumer that handles receiving events""" + self._backbone = backbone + """A standalone, system-created feature store used to share internal + information among MLI components""" + + def _on_start(self) -> None: + """Called on initial entry into Service `execute` event loop before + `_on_iteration` is invoked.""" + super()._on_start() + self._create_eventing() + + def _on_shutdown(self) -> None: + """Release dragon resources. Called immediately after exiting + the main event loop during automatic shutdown.""" + super()._on_shutdown() + + if not self._consumer: + return + + # remove descriptor for this listener from the backbone if it's there + if registered_consumer := self._backbone.backend_channel: + # if there is a descriptor in the backbone and it's still this listener + if registered_consumer == self._consumer.descriptor: + logger.info( + f"Listener clearing backend consumer {self._consumer.name} " + "from backbone" + ) + + # unregister this listener in the backbone + self._backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + # TODO: need the channel to be cleaned up + # self._consumer._comm_channel._channel.destroy() + + def _on_iteration(self) -> None: + """Executes calls to the machine learning worker implementation to complete + the inference pipeline.""" + + if self._consumer is None: + logger.info("Unable to listen. No consumer available.") + return + + self._consumer.listen_once(self._timeout, self._batch_timeout) + + def _can_shutdown(self) -> bool: + """Determines if the event consumer is ready to stop listening. + + :returns: True when criteria to shutdown the service are met, False otherwise + """ + + if self._backbone is None: + logger.info("Listener must shutdown. No backbone attached") + return True + + if self._consumer is None: + logger.info("Listener must shutdown. No consumer channel created") + return True + + if not self._consumer.listening: + logger.info( + f"Listener can shutdown. Consumer `{self._consumer.name}` " + "is not listening" + ) + return True + + return False + + def _on_unregister(self, event: OnRemoveConsumer) -> None: + """Event handler for updating the backbone when event consumers + are un-registered. 
+ + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + + # remove the descriptor specified in the event + if event.descriptor in notify_list: + logger.debug(f"Removing notify consumer: {event.descriptor}") + notify_list.remove(event.descriptor) + + # push the updated list back into the backbone + self._backbone.notification_channels = list(notify_list) + + def _on_register(self, event: OnCreateConsumer) -> None: + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + logger.debug(f"Adding notify consumer: {event.descriptor}") + notify_list.add(event.descriptor) + self._backbone.notification_channels = list(notify_list) + + def _on_event_received(self, event: EventBase) -> None: + """Primary event handler for the listener. Distributes events to + type-specific handlers. + + :param event: The event that was received + """ + if self._backbone is None: + logger.info("Unable to handle event. Backbone is missing.") + + if isinstance(event, OnCreateConsumer): + self._on_register(event) + elif isinstance(event, OnRemoveConsumer): + self._on_unregister(event) + else: + logger.info( + "Consumer registration listener received an " + f"unexpected event: {event=}" + ) + + def _on_health_check(self) -> None: + """Check if this consumer has been replaced by a new listener + and automatically trigger a shutdown. Invoked based on the + value of `self._health_check_frequency`.""" + super()._on_health_check() + + try: + logger.debug("Retrieving registered listener descriptor") + descriptor = self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + except KeyError: + descriptor = None + if self._consumer: + self._consumer.listening = False + + if self._consumer and descriptor != self._consumer.descriptor: + logger.warning( + f"Consumer `{self._consumer.name}` for `ConsumerRegistrationListener` " + "is no longer registered. It will automatically shut down." + ) + self._consumer.listening = False + + def _publish_consumer(self) -> None: + """Publish the registrar consumer descriptor to the backbone.""" + if self._consumer is None: + logger.warning("No registrar consumer descriptor available to publisher") + return + + logger.debug(f"Publishing {self._consumer.descriptor} to backbone") + self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = ( + self._consumer.descriptor + ) + + def _create_eventing(self) -> EventConsumer: + """ + Create an event publisher and event consumer for communicating with + other MLI resources. + + NOTE: the backbone must be initialized before connecting eventing clients. 
+ + :returns: The newly created EventConsumer instance + :raises SmartSimError: If a listener channel cannot be created + """ + + if self._consumer: + return self._consumer + + logger.info("Creating event consumer") + + dragon_channel = create_local(500) + event_channel = DragonCommChannel(dragon_channel) + + if not event_channel.descriptor: + raise SmartSimError( + "Unable to generate the descriptor for the event channel" + ) + + self._consumer = EventConsumer( + event_channel, + self._backbone, + [ + OnCreateConsumer.CONSUMER_CREATED, + OnRemoveConsumer.CONSUMER_REMOVED, + OnShutdownRequested.SHUTDOWN, + ], + name=f"ConsumerRegistrar.{socket.gethostname()}", + event_handler=self._on_event_received, + ) + self._publish_consumer() + + logger.info( + f"Backend consumer `{self._consumer.name}` created: " + f"{self._consumer.descriptor}" + ) + + return self._consumer + + +def _create_parser() -> argparse.ArgumentParser: + """ + Create an argument parser that contains the arguments + required to start the listener as a new process: + + --timeout + --batch_timeout + + :returns: A configured parser + """ + arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener") + + arg_parser.add_argument("--timeout", type=float, default=1.0) + arg_parser.add_argument("--batch_timeout", type=float, default=1.0) + + return arg_parser + + +def _connect_backbone() -> t.Optional[BackboneFeatureStore]: + """ + Load the backbone by retrieving the descriptor from environment variables. + + :returns: The backbone feature store + :raises SmartSimError: if a descriptor is not found + """ + descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "") + + if not descriptor: + return None + + logger.info(f"Listener backbone descriptor: {descriptor}\n") + + # `from_writable_descriptor` ensures we can update the backbone + return BackboneFeatureStore.from_writable_descriptor(descriptor) + + +if __name__ == "__main__": + mp.set_start_method("dragon") + + parser = _create_parser() + args = parser.parse_args() + + backbone_fs = _connect_backbone() + + if backbone_fs is None: + logger.error( + "Unable to attach to the backbone without the " + f"`{BackboneFeatureStore.MLI_BACKBONE}` environment variable." + ) + sys.exit(1) + + logger.debug(f"Listener attached to backbone: {backbone_fs.descriptor}") + + listener = ConsumerRegistrationListener( + backbone_fs, + float(args.timeout), + float(args.batch_timeout), + as_service=True, + ) + + logger.info(f"listener created? 
{listener}") + + try: + listener.execute() + sys.exit(0) + except Exception: + logger.exception("An error occurred in the event listener") + sys.exit(1) diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 67797fe448..e22a2c8f62 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -142,13 +142,22 @@ def ready(self) -> bool: :returns: True if the queue can be flushed, False otherwise """ if self.empty(): + logger.debug("Request dispatcher queue is empty") return False - timed_out = ( - self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout - ) - logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") - return self.full() or timed_out + timed_out = False + if self._batch_timeout >= 0: + timed_out = self._elapsed_time >= self._batch_timeout + + if self.full(): + logger.debug("Request dispatcher ready to deliver full batch") + return True + + if timed_out: + logger.debug("Request dispatcher delivering partial batch") + return True + + return False def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets @@ -218,7 +227,6 @@ def __init__( :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs :param mem_pool_size: Size of the memory pool used to allocate tensors - :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) self._queues: dict[str, list[BatchQueue]] = {} @@ -281,7 +289,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_missing = fs_desired - fs_actual if not self.has_featurestore_factory: - logger.error("No feature store factory configured") + logger.error("No feature store factory is configured. Unable to dispatch.") return False # create the feature stores we need to service request @@ -363,6 +371,7 @@ def _on_iteration(self) -> None: None, ) + logger.debug(f"Dispatcher is processing {len(bytes_list)} messages") request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() @@ -463,7 +472,7 @@ def dispatch(self, request: InferenceRequest) -> None: ) self._active_queues[tmp_id] = tmp_queue self._queues[tmp_id] = [tmp_queue] - tmp_queue.put_nowait(request) + tmp_queue.put(request) tmp_queue.make_disposable() return diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index 02043fbd80..5ba0fccc27 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -39,10 +39,15 @@ class EnvironmentConfigLoader: Facilitates the loading of a FeatureStore and Queue into the WorkerManager. 
""" + REQUEST_QUEUE_ENV_VAR = "_SMARTSIM_REQUEST_QUEUE" + """The environment variable that holds the request queue descriptor""" + BACKBONE_ENV_VAR = "_SMARTSIM_INFRA_BACKBONE" + """The environment variable that holds the backbone descriptor""" + def __init__( self, featurestore_factory: t.Callable[[str], FeatureStore], - callback_factory: t.Callable[[bytes], CommChannelBase], + callback_factory: t.Callable[[str], CommChannelBase], queue_factory: t.Callable[[str], CommChannelBase], ) -> None: """Initialize the config loader instance with the factories necessary for @@ -76,14 +81,16 @@ def get_backbone(self) -> t.Optional[FeatureStore]: :returns: The attached feature store via `_SMARTSIM_INFRA_BACKBONE` """ - descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") + descriptor = os.getenv(self.BACKBONE_ENV_VAR, "") if not descriptor: logger.warning("No backbone descriptor is configured") return None if self._featurestore_factory is None: - logger.warning("No feature store factory is configured") + logger.warning( + "No feature store factory is configured. Backbone not created." + ) return None self.backbone = self._featurestore_factory(descriptor) @@ -95,7 +102,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]: :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE` """ - descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") + descriptor = os.getenv(self.REQUEST_QUEUE_ENV_VAR, "") if not descriptor: logger.warning("No queue descriptor is configured") diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index b6655bded6..b12d7b11b4 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -24,13 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import enum -import pickle +import itertools +import os import time import typing as t -import uuid -from collections import defaultdict, deque -from dataclasses import dataclass # pylint: disable=import-error # isort: off @@ -38,7 +35,6 @@ # isort: on -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -48,16 +44,29 @@ logger = get_logger(__name__) -# todo: did i create an arms race where a developer just grabs the backbone -# and passes it wherever they need a FeatureStore? 
class BackboneFeatureStore(DragonFeatureStore): """A DragonFeatureStore wrapper with utility methods for accessing shared information stored in the MLI backbone feature store.""" MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" + """Unique key used in the backbone to locate the consumer list""" + MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER" + """Unique key used in the backbone to locate the registration consumer""" + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" + """Unique key used in the backbone to locate MLI work queue""" + MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE" + """Unique key used in the backbone to locate the backbone feature store""" + _CREATED_ON = "creation" + """Unique key used in the backbone to locate the creation date of the + feature store""" + _DEFAULT_WAIT_TIMEOUT = 1.0 + """The default wait time (in seconds) for blocking requests to + the feature store""" def __init__( - self, storage: "dragon_ddict.DDict", allow_reserved_writes: bool = False + self, + storage: dragon_ddict.DDict, + allow_reserved_writes: bool = False, ) -> None: """Initialize the DragonFeatureStore instance. @@ -68,13 +77,33 @@ def __init__( super().__init__(storage) self._enable_reserved_writes = allow_reserved_writes + self._record_creation_data() + + @property + def wait_timeout(self) -> float: + """Retrieve the wait timeout for this feature store. The wait timeout is + applied to all calls to `wait_for`. + + :returns: The wait timeout (in seconds). + """ + return self._wait_timeout + + @wait_timeout.setter + def wait_timeout(self, value: float) -> None: + """Set the wait timeout (in seconds) for this feature store. The wait + timeout is applied to all calls to `wait_for`. + + :param value: The new value to set + """ + self._wait_timeout = value + @property def notification_channels(self) -> t.Sequence[str]: """Retrieve descriptors for all registered MLI notification channels. - :returns: The list of descriptors + :returns: The list of channel descriptors """ - if "_SMARTSIM_MLI_NOTIFY_CONSUMERS" in self: + if self.MLI_NOTIFY_CONSUMERS in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] return str(stored_consumers).split(",") return [] @@ -85,335 +114,146 @@ def notification_channels(self, values: t.Sequence[str]) -> None: :param values: The list of channel descriptors to save """ - self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values]) - - -class EventCategory(str, enum.Enum): - """Predefined event types raised by SmartSim backend.""" - - CONSUMER_CREATED: str = "consumer-created" - FEATURE_STORE_WRITTEN: str = "feature-store-written" - - -@dataclass -class EventBase: - """Core API for an event.""" - - # todo: shift eventing code to: infrastructure / event / event.py - category: EventCategory - """The event category for this event; may be used for addressing, - prioritization, or filtering of events by a event publisher/consumer""" - - uid: str - """A unique identifier for this event""" - - def __bytes__(self) -> bytes: - """Default conversion to bytes for an event required to publish - messages using byte-oriented communication channels. - - :returns: This entity encoded as bytes""" - return pickle.dumps(self) - - def __str__(self) -> str: - """Convert the event to a string. 
- - :returns: A string representation of this instance""" - return f"{self.uid}|{self.category}" - - -class OnCreateConsumer(EventBase): - """Publish this event when a new event consumer registration is required.""" - - descriptor: str - """Descriptor of the comm channel exposed by the consumer""" - - def __init__(self, descriptor: str) -> None: - """Initialize the OnCreateConsumer event. - - :param descriptor: Descriptor of the comm channel exposed by the consumer - """ - super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4())) - self.descriptor = descriptor - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}" - - -class OnWriteFeatureStore(EventBase): - """Publish this event when a feature store key is written.""" - - descriptor: str - """The descriptor of the feature store where the write occurred""" - - key: str - """The key identifying where the write occurred""" - - def __init__(self, descriptor: str, key: str) -> None: - """Initialize the OnWriteFeatureStore event. - - :param descriptor: The descriptor of the feature store where the write occurred - :param key: The key identifying where the write occurred - """ - super().__init__(EventCategory.FEATURE_STORE_WRITTEN, str(uuid.uuid4())) - self.descriptor = descriptor - self.key = key - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}|{self.key}" - - -class EventProducer(t.Protocol): - """Core API of a class that publishes events.""" - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation. - - :param event: The event to send - :param timeout: Maximum time to wait (in seconds) for messages to send - """ - - -class EventBroadcaster: - """Performs fan-out publishing of system events.""" - - def __init__( - self, - backbone: BackboneFeatureStore, - channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, - ) -> None: - """Initialize the EventPublisher instance. - - :param backbone: The MLI backbone feature store - :param channel_factory: Factory method to construct new channel instances - """ - self._backbone = backbone - """The backbone feature store used to retrieve consumer descriptors""" - self._channel_factory = channel_factory - """A factory method used to instantiate channels from descriptors""" - self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict( - lambda: None + self[self.MLI_NOTIFY_CONSUMERS] = ",".join( + [str(value) for value in values if value] ) - """A mapping of instantiated channels that can be re-used. Automatically - calls the channel factory if a descriptor is not already in the collection""" - self._event_buffer: t.Deque[bytes] = deque() - """A buffer for storing events when a consumer list is not found""" - self._descriptors: t.Set[str] - """Stores the most recent list of broadcast consumers. Updated automatically - on each broadcast""" - self._uid = str(uuid.uuid4()) - """A unique identifer assigned to the broadcaster for logging""" @property - def num_buffered(self) -> int: - """Return the number of events currently buffered to send. + def backend_channel(self) -> t.Optional[str]: + """Retrieve the channel descriptor used to register event consumers. 
- :returns: Number of buffered events - """ - return len(self._event_buffer) + :returns: The channel descriptor""" + if self.MLI_REGISTRAR_CONSUMER in self: + return str(self[self.MLI_REGISTRAR_CONSUMER]) + return None - def _save_to_buffer(self, event: EventBase) -> None: - """Places a serialized event in the buffer to be sent once a consumer - list is available. - - :param event: The event to serialize and buffer - :raises ValueError: If the event cannot be serialized - """ - try: - event_bytes = bytes(event) - self._event_buffer.append(event_bytes) - except Exception as ex: - raise ValueError(f"Unable to serialize event from {self._uid}") from ex - - def _log_broadcast_start(self) -> None: - """Logs broadcast statistics.""" - num_events = len(self._event_buffer) - num_copies = len(self._descriptors) - logger.debug( - f"Broadcast {num_events} events to {num_copies} consumers from {self._uid}" - ) + @backend_channel.setter + def backend_channel(self, value: str) -> None: + """Set the channel used to register event consumers. - def _prune_unused_consumers(self) -> None: - """Performs maintenance on the channel cache by pruning any channel - that has been removed from the consumers list.""" - active_consumers = set(self._descriptors) - current_channels = set(self._channel_cache.keys()) + :param value: The stringified channel descriptor""" + self[self.MLI_REGISTRAR_CONSUMER] = value - # find any cached channels that are now unused - inactive_channels = current_channels.difference(active_consumers) - new_channels = active_consumers.difference(current_channels) + @property + def worker_queue(self) -> t.Optional[str]: + """Retrieve the channel descriptor used to send work to MLI worker managers. - for descriptor in inactive_channels: - self._channel_cache.pop(descriptor) + :returns: The channel descriptor, if found. Otherwise, `None`""" + if self.MLI_WORKER_QUEUE in self: + return str(self[self.MLI_WORKER_QUEUE]) + return None - logger.debug( - f"Pruning {len(inactive_channels)} stale consumers and" - f" found {len(new_channels)} new channels for {self._uid}" - ) + @worker_queue.setter + def worker_queue(self, value: str) -> None: + """Set the channel descriptor used to send work to MLI worker managers. - def _get_comm_channel(self, descriptor: str) -> CommChannelBase: - """Helper method to build and cache a comm channel. + :param value: The channel descriptor""" + self[self.MLI_WORKER_QUEUE] = value - :param descriptor: The descriptor to pass to the channel factory - :returns: The instantiated channel - :raises SmartSimError: If the channel fails to build + @property + def creation_date(self) -> str: + """Return the creation date for the backbone feature store. + + :returns: The string-formatted date when feature store was created""" + return str(self[self._CREATED_ON]) + + def _record_creation_data(self) -> None: + """Write the creation timestamp to the feature store.""" + if self._CREATED_ON not in self: + if not self._allow_reserved_writes: + logger.warning( + "Recorded creation from a write-protected backbone instance" + ) + self[self._CREATED_ON] = str(time.time()) + + os.environ[self.MLI_BACKBONE] = self.descriptor + + @classmethod + def from_writable_descriptor( + cls, + descriptor: str, + ) -> "BackboneFeatureStore": + """A factory method that creates an instance from a descriptor string. 
+ + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFeatureStore + :raises SmartSimError: if attachment to DragonFeatureStore fails """ - comm_channel = self._channel_cache[descriptor] - if comm_channel is not None: - return comm_channel - - if self._channel_factory is None: - raise SmartSimError("No channel factory provided for consumers") - try: - channel = self._channel_factory(descriptor) - self._channel_cache[descriptor] = channel - return channel + return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True) except Exception as ex: - msg = f"Unable to construct channel with descriptor: {descriptor}" - logger.error(msg, exc_info=True) - raise SmartSimError(msg) from ex + raise SmartSimError( + f"Error creating backbone feature store: {descriptor}" + ) from ex - def _broadcast(self, timeout: float = 0.001) -> int: - """Broadcasts all buffered events to registered event consumers. + def _check_wait_timeout( + self, start_time: float, timeout: float, indicators: t.Dict[str, bool] + ) -> None: + """Perform timeout verification. - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events broadcasted to consumers - :raises SmartSimError: If broadcasting fails + :param start_time: the start time to use for elapsed calculation + :param timeout: the timeout (in seconds) + :param indicators: latest retrieval status for requested keys + :raises SmartSimError: If the timeout elapses before all values are + retrieved """ - # allow descriptors to be empty since events are buffered - self._descriptors = set(x for x in self._backbone.notification_channels if x) - if not self._descriptors: - logger.warning(f"No event consumers are registered for {self._uid}") - return 0 - - self._prune_unused_consumers() - self._log_broadcast_start() - - num_sent: int = 0 - next_event: t.Optional[bytes] = self._event_buffer.popleft() - - # send each event to every consumer - while next_event is not None: - for descriptor in map(str, self._descriptors): - comm_channel = self._get_comm_channel(descriptor) - - try: - # todo: given a failure, the message is not sent to any other - # recipients. consider retrying, adding a dead letter queue, or - # logging the message details more intentionally - comm_channel.send(next_event, timeout) - num_sent += 1 - except Exception as ex: - raise SmartSimError( - f"Failed broadcast to channel {descriptor} from {self._uid}" - ) from ex - - try: - next_event = self._event_buffer.popleft() - except IndexError: - next_event = None - logger.debug(f"Broadcast buffer exhausted for {self._uid}") - - return num_sent - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Implementation of `send` method of the `EventPublisher` protocol. Publishes - the supplied event to all registered broadcast consumers. 
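A sketch of attaching to an existing backbone by descriptor and reading the reserved entries through the typed properties; the descriptor string is assumed to come from another process:

```python
# Illustrative sketch; `descriptor` is assumed to be produced by an existing backbone.
from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)


def inspect_backbone(descriptor: str) -> None:
    backbone = BackboneFeatureStore.from_writable_descriptor(descriptor)

    # reserved entries are surfaced as typed properties; both return None when unset
    print("worker queue:", backbone.worker_queue)
    print("registrar channel:", backbone.backend_channel)

    # the creation timestamp is recorded the first time the backbone is constructed
    print("created on:", backbone.creation_date)
```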
- - :param event: An event to publish - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events successfully published - :raises ValueError: If event serialization fails - :raises KeyError: If channel fails to attach using registered descriptors - :raises SmartSimError: If any unexpected error occurs during send + elapsed = time.time() - start_time + if timeout and elapsed > timeout: + raise SmartSimError( + f"Backbone {self.descriptor=} timeout after {elapsed} " + f"seconds retrieving keys: {indicators}" + ) + + def wait_for( + self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT + ) -> t.Dict[str, t.Union[str, bytes, None]]: + """Perform a blocking wait until all specified keys have been found + in the backbone. + + :param keys: The required collection of keys to retrieve + :param timeout: The maximum wait time in seconds + :returns: Dictionary containing the keys and values requested + :raises SmartSimError: If the timeout elapses without retrieving + all requested keys """ - try: - self._save_to_buffer(event) - return self._broadcast(timeout) - except (KeyError, ValueError, SmartSimError): - raise - except Exception as ex: - raise SmartSimError("An unexpected failure occurred while sending") from ex + if timeout < 0: + timeout = self._DEFAULT_WAIT_TIMEOUT + logger.info(f"Using default wait_for timeout: {timeout}s") + if not keys: + return {} -class EventConsumer: - """Reads system events published to a communications channel.""" + values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)} + is_found = {k: False for k in values.keys()} - def __init__( - self, - comm_channel: CommChannelBase, - backbone: BackboneFeatureStore, - filters: t.Optional[t.List[EventCategory]] = None, - batch_timeout: t.Optional[float] = None, - ) -> None: - """Initialize the EventConsumer instance. - - :param comm_channel: Communications channel to listen to for events - :param backbone: The MLI backbone feature store - :param filters: A list of event types to deliver. when empty, all - events will be delivered - :param timeout: Maximum time to wait for messages to arrive; may be overridden - on individual calls to `receive` - :raises ValueError: If batch_timeout <= 0 - """ - if batch_timeout is not None and batch_timeout <= 0: - raise ValueError("batch_timeout must be a non-zero, positive value") - - self._comm_channel = comm_channel - self._backbone = backbone - self._global_filters = filters or [] - self._global_timeout = batch_timeout or 1.0 - - def receive( - self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0 - ) -> t.List[EventBase]: - """Receives available published event(s). 
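`wait_for` polls with a cycling backoff (0.1 to 0.8 seconds) until every requested key is present or the timeout elapses. A sketch of a process blocking at startup, assuming the backbone descriptor was exported through the `MLI_BACKBONE` environment variable:

```python
# Illustrative sketch: block until the worker queue descriptor is published.
import os

from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)


def wait_for_worker_queue() -> str:
    # the backbone descriptor is assumed to be exported via MLI_BACKBONE
    descriptor = os.environ[BackboneFeatureStore.MLI_BACKBONE]
    backbone = BackboneFeatureStore.from_writable_descriptor(descriptor)

    # raises SmartSimError if the key is still missing after 30 seconds
    config = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], timeout=30)
    return str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
```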
- - :param filters: Additional filters to add to the global filters configured - on the EventConsumer instance - :param timeout: Maximum time to wait for messages to arrive - :returns: A list of events that pass any configured filters - """ - if filters is None: - filters = [] - - filter_set = {*self._global_filters, *filters} - messages: t.List[t.Any] = [] + backoff = (0.1, 0.2, 0.4, 0.8) + backoff_iter = itertools.cycle(backoff) + start_time = time.time() - # use the local timeout to override a global setting - start_at = time.time_ns() + while not all(is_found.values()): + delay = next(backoff_iter) - while msg_bytes_list := self._comm_channel.recv(timeout=timeout): - # remove any empty messages that will fail to decode - msg_bytes_list = [msg for msg in msg_bytes_list if msg] + for key in [k for k, v in is_found.items() if not v]: + try: + values[key] = self[key] + is_found[key] = True + except Exception: + if delay == backoff[-1]: + logger.debug(f"Re-attempting `{key}` retrieval in {delay}s") - msg: t.Optional[EventBase] = None - if msg_bytes_list: - for message in msg_bytes_list: - msg = pickle.loads(message) + if all(is_found.values()): + logger.debug(f"wait_for({keys}) retrieved all keys") + continue - if not msg: - logger.warning("Unable to unpickle message") - continue + self._check_wait_timeout(start_time, timeout, is_found) + time.sleep(delay) - # ignore anything that doesn't match a filter (if one is - # supplied), otherwise return everything - if not filter_set or msg.category in filter_set: - messages.append(msg) + return values - # avoid getting stuck indefinitely waiting for the channel - elapsed = (time.time_ns() - start_at) / 1000000000 - remaining = elapsed - self._global_timeout - if remaining > 0: - logger.debug(f"Consumer batch timeout exceeded by: {abs(remaining)}") - break + def get_env(self) -> t.Dict[str, str]: + """Returns a dictionary populated with environment variables necessary to + connect a process to the existing backbone instance. - return messages + :returns: The dictionary populated with env vars + """ + return {self.MLI_BACKBONE: self.descriptor} diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index d7b37ffe61..24f2221c87 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -32,6 +32,10 @@ # isort: on +from smartsim._core.mli.infrastructure.storage.dragon_util import ( + ddict_to_descriptor, + descriptor_to_ddict, +) from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.error import SmartSimError from smartsim.log import get_logger @@ -46,15 +50,20 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: """Initialize the DragonFeatureStore instance. :param storage: A distributed dictionary to be used as the underlying - storage mechanism of the feature store - """ + storage mechanism of the feature store""" + if storage is None: + raise ValueError( + "Storage is required when instantiating a DragonFeatureStore." 
+ ) + + descriptor = "" if isinstance(storage, dragon_ddict.DDict): - descriptor = str(storage.serialize()) - else: - descriptor = "not-set" + descriptor = ddict_to_descriptor(storage) super().__init__(descriptor) self._storage: t.Dict[str, t.Union[str, bytes]] = storage + """The underlying storage mechanism of the DragonFeatureStore; a + distributed, in-memory key-value store""" def _get(self, key: str) -> t.Union[str, bytes]: """Retrieve a value from the underlying storage mechanism. @@ -65,7 +74,7 @@ def _get(self, key: str) -> t.Union[str, bytes]: """ try: return self._storage[key] - except KeyError as e: + except dragon_ddict.DDictError as e: raise KeyError(f"Key not found in FeatureStore: {key}") from e def _set(self, key: str, value: t.Union[str, bytes]) -> None: @@ -85,6 +94,17 @@ def _contains(self, key: str) -> bool: """ return key in self._storage + def pop(self, key: str) -> t.Union[str, bytes, None]: + """Remove the value from the dictionary and return the value. + + :param key: Dictionary key to retrieve + :returns: The value held at the key if it exists, otherwise `None + `""" + try: + return self._storage.pop(key) + except dragon_ddict.DDictError: + return None + @classmethod def from_descriptor( cls, @@ -97,9 +117,10 @@ def from_descriptor( :raises SmartSimError: If attachment to DragonFeatureStore fails """ try: - return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) + logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}") + storage = descriptor_to_ddict(descriptor) + return cls(storage) except Exception as ex: - logger.error(f"Error creating dragon feature store: {descriptor}") raise SmartSimError( - f"Error creating dragon feature store: {descriptor}" + f"Error creating dragon feature store from descriptor: {descriptor}" ) from ex diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py new file mode 100644 index 0000000000..50d15664c0 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -0,0 +1,101 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str: + """Convert a DDict to a descriptor string. + + :param ddict: The dragon dictionary to convert + :returns: The descriptor string + :raises ValueError: If a ddict is not provided + """ + if ddict is None: + raise ValueError("DDict is not available to create a descriptor") + + # unlike other dragon objects, the dictionary serializes to a string + # instead of bytes + return str(ddict.serialize()) + + +def descriptor_to_ddict(descriptor: str) -> dragon_ddict.DDict: + """Create and attach a new DDict instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of a dictionary to attach to + :returns: The attached dragon dictionary""" + return dragon_ddict.DDict.attach(descriptor) + + +def create_ddict( + num_nodes: int, mgr_per_node: int, mem_per_node: int +) -> dragon_ddict.DDict: + """Create a distributed dragon dictionary. + + :param num_nodes: The number of distributed nodes to distribute the dictionary to. + At least one node is required. + :param mgr_per_node: The number of manager processes per node + :param mem_per_node: The amount of memory (in megabytes) to allocate per node. Total + memory available will be calculated as `num_nodes * node_mem` + + :returns: The instantiated dragon dictionary + :raises ValueError: If invalid num_nodes is supplied + :raises ValueError: If invalid mem_per_node is supplied + :raises ValueError: If invalid mgr_per_node is supplied + """ + if num_nodes < 1: + raise ValueError("A dragon dictionary must have at least 1 node") + + if mgr_per_node < 1: + raise ValueError("A dragon dict requires at least 2 managers per ndode") + + if mem_per_node < dragon_ddict.DDICT_MIN_SIZE: + raise ValueError( + "A dragon dictionary requires at least " + f"{dragon_ddict.DDICT_MIN_SIZE / 1024} MB" + ) + + mem_total = num_nodes * mem_per_node + + logger.debug( + f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} MB memory" + ) + + distributed_dict = dragon_ddict.DDict(num_nodes, mgr_per_node, total_mem=mem_total) + logger.debug( + "Successfully created dragon dictionary with " + f"{num_nodes} nodes, {mem_total} MB total memory" + ) + return distributed_dict diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index a55c523058..ebca07ed4e 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -43,6 +43,14 @@ class ReservedKeys(str, enum.Enum): """Storage location for the list of registered consumers that will receive events from an EventBroadcaster""" + MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER" + """Storage location for the channel used to send messages directly to + the MLI backend""" + + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" + """Storage location for the channel used to send work requests + to the available worker managers""" + @classmethod def contains(cls, value: str) -> bool: """Convert a string representation into an enumeration member. 
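The `dragon_util` helpers centralize DDict creation and descriptor handling. A sketch, assuming a Dragon allocation is available; the sizing values mirror the test fixture added later in this patch:

```python
# Illustrative sketch; requires a running Dragon runtime.
from smartsim._core.mli.infrastructure.storage import dragon_util

# one node, two managers per node, 32 MB per node
storage = dragon_util.create_ddict(1, 2, 32 * 1024**2)

# round-trip the dictionary through its string descriptor
descriptor = dragon_util.ddict_to_descriptor(storage)
attached = dragon_util.descriptor_to_ddict(descriptor)
```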
@@ -59,7 +67,27 @@ def contains(cls, value: str) -> bool: @dataclass(frozen=True) -class FeatureStoreKey: +class TensorKey: + """A key,descriptor pair enabling retrieval of an item from a feature store.""" + + key: str + """The unique key of an item in a feature store""" + descriptor: str + """The unique identifier of the feature store containing the key""" + + def __post_init__(self) -> None: + """Ensure the key and descriptor have at least one character. + + :raises ValueError: If key or descriptor are empty strings + """ + if len(self.key) < 1: + raise ValueError("Key must have at least one character.") + if len(self.descriptor) < 1: + raise ValueError("Descriptor must have at least one character.") + + +@dataclass(frozen=True) +class ModelKey: """A key,descriptor pair enabling retrieval of an item from a feature store.""" key: str @@ -119,8 +147,8 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: """ try: return self._get(key) - except KeyError as ex: - raise SmartSimError(f"An unknown key was requested: {key}") from ex + except KeyError: + raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise SmartSimError( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 530d251540..9556b8e438 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -39,17 +39,16 @@ from ...comm.channel.channel import CommChannelBase from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -from ..storage.feature_store import FeatureStore, FeatureStoreKey +from ..storage.feature_store import FeatureStore, ModelKey, TensorKey if t.TYPE_CHECKING: - from smartsim._core.mli.mli_schemas.data.data_references_capnp import TensorKey from smartsim._core.mli.mli_schemas.response.response_capnp import Status from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) # Placeholder -ModelIdentifier = FeatureStoreKey +ModelIdentifier = ModelKey class InferenceRequest: @@ -57,12 +56,12 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[FeatureStoreKey] = None, + model_key: t.Optional[ModelKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - input_keys: t.Optional[t.List[FeatureStoreKey]] = None, + input_keys: t.Optional[t.List[TensorKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[FeatureStoreKey]] = None, + output_keys: t.Optional[t.List[TensorKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): @@ -112,7 +111,7 @@ def has_model_key(self) -> bool: @property def has_raw_inputs(self) -> bool: - """Check if the InferenceRequest contains raw_outputs. + """Check if the InferenceRequest contains raw_inputs. 
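Both key types are frozen dataclasses, and `TensorKey` rejects empty fields at construction time. A brief sketch with a placeholder descriptor:

```python
from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey

fsd = "feature-store-descriptor"  # placeholder descriptor for illustration

tensor_key = TensorKey(key="input-tensor", descriptor=fsd)
model_key = ModelKey(key="my-model", descriptor=fsd)

# empty keys or descriptors are rejected at construction time
try:
    TensorKey(key="", descriptor=fsd)
except ValueError as exc:
    print(exc)  # Key must have at least one character.
```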
:returns: True if raw_outputs is not None and is not an empty list, False otherwise @@ -153,7 +152,7 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, + output_keys: t.Optional[t.Collection[TensorKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: @@ -166,7 +165,7 @@ def __init__( """ self.outputs: t.Collection[t.Any] = outputs or [] """List of output data""" - self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[TensorKey]] = output_keys or [] """List of keys used for output data""" self.status_enum = status_enum """Status of the reply""" @@ -201,6 +200,7 @@ def __init__(self, model: t.Any) -> None: :param model: The loaded model """ self.model = model + """The loaded model (e.g. a TensorFlow, PyTorch, ONNX, etc. model)""" class TransformInputResult: @@ -320,7 +320,7 @@ class RequestBatch: """List of InferenceRequests in the batch""" inputs: t.Optional[TransformInputResult] """Transformed batch of input tensors""" - model_id: ModelIdentifier + model_id: "ModelIdentifier" """Model (key, descriptor) tuple""" @property @@ -350,7 +350,7 @@ def raw_model(self) -> t.Optional[t.Any]: return None @property - def input_keys(self) -> t.List[FeatureStoreKey]: + def input_keys(self) -> t.List[TensorKey]: """All input keys available in this batch's requests. :returns: All input keys belonging to requests in this batch""" @@ -361,7 +361,7 @@ def input_keys(self) -> t.List[FeatureStoreKey]: return keys @property - def output_keys(self) -> t.List[FeatureStoreKey]: + def output_keys(self) -> t.List[TensorKey]: """All output keys available in this batch's requests. :returns: All output keys belonging to requests in this batch""" @@ -378,7 +378,7 @@ class MachineLearningWorkerCore: @staticmethod def deserialize_message( data_blob: bytes, - callback_factory: t.Callable[[bytes], CommChannelBase], + callback_factory: t.Callable[[str], CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest. 
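With `FeatureStoreKey` split into `ModelKey` and `TensorKey`, requests and batches are built from the new types. A sketch using placeholder keys and a placeholder descriptor:

```python
# Illustrative sketch; key names and the descriptor are placeholders.
from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey
from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest, RequestBatch

fsd = "feature-store-descriptor"

model_key = ModelKey(key="my-model", descriptor=fsd)
request = InferenceRequest(
    model_key=model_key,
    input_keys=[TensorKey(key="input-0", descriptor=fsd)],
    output_keys=[TensorKey(key="output-0", descriptor=fsd)],
)

# a batch groups requests that share the same model identifier
batch = RequestBatch([request], None, model_key)
```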
@@ -388,27 +388,27 @@ def deserialize_message( :returns: The raw input message deserialized into an InferenceRequest """ request = MessageHandler.deserialize_request(data_blob) - model_key: t.Optional[FeatureStoreKey] = None + model_key: t.Optional[ModelKey] = None model_bytes: t.Optional[Model] = None if request.model.which() == "key": - model_key = FeatureStoreKey( + model_key = ModelKey( key=request.model.key.key, - descriptor=request.model.key.featureStoreDescriptor, + descriptor=request.model.key.descriptor, ) elif request.model.which() == "data": model_bytes = request.model.data callback_key = request.replyChannel.descriptor comm_channel = callback_factory(callback_key) - input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_keys: t.Optional[t.List[TensorKey]] = None input_bytes: t.Optional[t.List[bytes]] = None - output_keys: t.Optional[t.List[FeatureStoreKey]] = None + output_keys: t.Optional[t.List[TensorKey]] = None input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": @@ -416,7 +416,7 @@ def deserialize_message( if request.output: output_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.output ] @@ -490,7 +490,7 @@ def fetch_model( feature_store = feature_stores[fsd] raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) - except FileNotFoundError as ex: + except (FileNotFoundError, KeyError) as ex: logger.exception(ex) raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @@ -545,12 +545,12 @@ def place_output( request: InferenceRequest, transform_result: TransformOutputResult, feature_stores: t.Dict[str, FeatureStore], - ) -> t.Collection[t.Optional[FeatureStoreKey]]: + ) -> t.Collection[t.Optional[TensorKey]]: """Given a collection of data, make it available as a shared resource in the feature store. :param request: The request that triggered the pipeline - :param execute_result: Results from inference + :param transform_result: Transformed version of the inference result :param feature_stores: Available feature stores used for persistence :returns: A collection of keys that were placed in the feature store :raises ValueError: If a feature store is not provided @@ -558,7 +558,7 @@ def place_output( if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[FeatureStoreKey]] = [] + keys: t.List[t.Optional[TensorKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. @@ -580,10 +580,12 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): def load_model( batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - """Given a loaded MachineLearningModel, ensure it is loaded into - device memory. + """Given the raw bytes of an ML model that were fetched, ensure + it is loaded into device memory. :param request: The request that triggered the pipeline + :param fetch_result: The result of a fetch-model operation; contains + the raw bytes of the ML model. 
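Since `ChannelDescriptor.descriptor` is now text, the callback factory passed to `deserialize_message` receives a string. A sketch, assuming `DragonCommChannel.from_descriptor` is supplied as that factory:

```python
# Illustrative sketch; the factory maps a string descriptor to a comm channel.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerCore


def to_request(data_blob: bytes):
    # the callback factory now receives the reply channel descriptor as a string
    return MachineLearningWorkerCore.deserialize_message(
        data_blob, DragonCommChannel.from_descriptor
    )
```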
:param device: The device on which the model must be placed :returns: LoadModelResult wrapping the model loaded for the request :raises ValueError: If model reference object is not found @@ -600,7 +602,7 @@ def transform_input( """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. - :param request: The request that triggered the pipeline + :param batch: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 71def143ad..e3d46a7ab3 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -35,6 +35,10 @@ class MessageHandler: + """Utility methods for transforming capnproto messages to and from + internal representations. + """ + @staticmethod def build_tensor_descriptor( order: "tensor_capnp.Order", @@ -73,7 +77,7 @@ def build_output_tensor_descriptor( order, data type, and dimensions. :param order: Order of the tensor, such as row-major (c) or column-major (f) - :param keys: List of TensorKeys to apply transorm descriptor to + :param keys: List of TensorKey to apply transorm descriptor to :param data_type: Tranform data type of the tensor :param dimensions: Transform dimensions of the tensor :returns: The OutputDescriptor @@ -92,14 +96,12 @@ def build_output_tensor_descriptor( return description @staticmethod - def build_tensor_key( - key: str, feature_store_descriptor: str - ) -> data_references_capnp.TensorKey: + def build_tensor_key(key: str, descriptor: str) -> data_references_capnp.TensorKey: """ Builds a new TensorKey message with the provided key. :param key: String to set the TensorKey - :param feature_store_descriptor: A descriptor identifying the feature store + :param descriptor: A descriptor identifying the feature store containing the key :returns: The TensorKey :raises ValueError: If building fails @@ -107,7 +109,7 @@ def build_tensor_key( try: tensor_key = data_references_capnp.TensorKey.new_message() tensor_key.key = key - tensor_key.featureStoreDescriptor = feature_store_descriptor + tensor_key.descriptor = descriptor except Exception as e: raise ValueError("Error building tensor key.") from e return tensor_key @@ -133,14 +135,12 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: return model @staticmethod - def build_model_key( - key: str, feature_store_descriptor: str - ) -> data_references_capnp.ModelKey: + def build_model_key(key: str, descriptor: str) -> data_references_capnp.ModelKey: """ Builds a new ModelKey message with the provided key. 
:param key: String to set the ModelKey - :param feature_store_descriptor: A descriptor identifying the feature store + :param descriptor: A descriptor identifying the feature store containing the key :returns: The ModelKey :raises ValueError: If building fails @@ -148,9 +148,9 @@ def build_model_key( try: model_key = data_references_capnp.ModelKey.new_message() model_key.key = key - model_key.featureStoreDescriptor = feature_store_descriptor + model_key.descriptor = descriptor except Exception as e: - raise ValueError("Error building model key.") from e + raise ValueError("Error building tensor key.") from e return model_key @staticmethod @@ -242,7 +242,7 @@ def _assign_model( @staticmethod def _assign_reply_channel( - request: request_capnp.Request, reply_channel: bytes + request: request_capnp.Request, reply_channel: str ) -> None: """ Assigns a reply channel to the supplied request. @@ -360,7 +360,7 @@ def _assign_custom_request_attributes( @staticmethod def build_request( - reply_channel: bytes, + reply_channel: str, model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ t.List[data_references_capnp.TensorKey], diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index 699abe5d22..65293be7b2 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,10 +28,10 @@ struct ModelKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } struct TensorKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index bcf53e0a04..a5e318a556 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,7 +36,7 @@ from typing import Iterator class ModelKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( @@ -72,7 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 4be1cfa215..26d9542d9f 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -32,7 +32,7 @@ using DataRef = import "../data/data_references.capnp"; using Models = import "../model/model.capnp"; struct ChannelDescriptor { - descriptor @0 :Data; + descriptor @0 :Text; } struct Request { diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index a4ad631f9f..2aab80b1d0 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -61,7 +61,7 @@ from .request_attributes.request_attributes_capnp import ( ) class ChannelDescriptor: - descriptor: bytes + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 114db88d90..f99950739e 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -145,10 +145,12 @@ def max_length(self) -> int: return max(len(value) for value in 
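With the schema field renamed from `featureStoreDescriptor` to `descriptor`, key construction takes the feature store descriptor directly. A sketch with a placeholder descriptor:

```python
from smartsim._core.mli.message_handler import MessageHandler

fs_descriptor = "feature-store-descriptor"  # placeholder value for illustration

tensor_key = MessageHandler.build_tensor_key("input-tensor", fs_descriptor)
model_key = MessageHandler.build_model_key("my-model", fs_descriptor)
```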
self._timings.values()) def print_timings(self, to_file: bool = False) -> None: - """Print all timing information + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. - :param to_file: flag indicating if timing should be written to stdout - or to the timing file""" + :param to_file: If `True`, also saves timing information + to the files `timings.npy` and `timings.txt` + """ print(" ".join(self._timings.keys())) try: value_array = np.array(list(self._timings.values()), dtype=float) diff --git a/smartsim/log.py b/smartsim/log.py index 3d6c0860ee..c8fed9329f 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -252,16 +252,21 @@ def filter(self, record: logging.LogRecord) -> bool: return record.levelno <= level_no -def log_to_file(filename: str, log_level: str = "debug") -> None: +def log_to_file( + filename: str, log_level: str = "debug", logger: t.Optional[logging.Logger] = None +) -> None: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. - :param filename: the name of the desired log file. - :param log_level: as defined in get_logger. Can be specified + :param filename: The name of the desired log file. + :param log_level: As defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. + :param logger: If supplied, a logger to add the file stream logging + behavior to. By default, a new logger is instantiated. """ - logger = logging.getLogger("SmartSim") + if logger is None: + logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with filename, "w+", encoding="utf-8" ) diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py index 2348784236..4c46359c2d 100644 --- a/tests/dragon/channel.py +++ b/tests/dragon/channel.py @@ -39,17 +39,15 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: - """Initialize the FileSystemCommChannel instance + def __init__(self, key: pathlib.Path) -> None: + """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -57,10 +55,11 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. 
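A sketch of the updated `log_to_file` signature; the logger name and file names are placeholders:

```python
import logging

from smartsim.log import log_to_file

# default behavior: attaches a file handler to the "SmartSim" logger
log_to_file("smartsim.log", log_level="debug")

# new behavior: attach the file handler to a caller-supplied logger instead
my_logger = logging.getLogger("my.module")
log_to_file("my_module.log", log_level="info", logger=my_logger)
```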
+ :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: @@ -69,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -102,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,17 +110,15 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py new file mode 100644 index 0000000000..d542700175 --- /dev/null +++ b/tests/dragon/conftest.py @@ -0,0 +1,129 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
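A sketch of the string-descriptor round trip for the test channel; the import path and the `descriptor` attribute inherited from `CommChannelBase` are assumptions:

```python
# Illustrative sketch of the test helper defined in tests/dragon/channel.py.
import pathlib

from tests.dragon.channel import FileSystemCommChannel  # assumed import path

channel = FileSystemCommChannel(pathlib.Path("/tmp/test-events.txt"))
channel.send(b"hello")

# a second handle can be attached from the plain-string descriptor
attached = FileSystemCommChannel.from_descriptor(channel.descriptor)
messages = attached.recv()
```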
+ +from __future__ import annotations + +import os +import pathlib +import socket +import subprocess +import sys +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +# isort: off +import dragon.data.ddict.ddict as dragon_ddict +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process + +from dragon.fli import FLInterface + +# isort: on + +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.storage import dragon_util +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +@pytest.fixture(scope="module") +def the_storage() -> dragon_ddict.DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return dragon_util.create_ddict(1, 2, 32 * 1024**2) + + +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to.""" + channel_ = create_local() + fli_ = FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_) + return comm_channel + + +@pytest.fixture(scope="module") +def the_backbone( + the_storage: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. + + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel + """ + + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone + + +@pytest.fixture(scope="module") +def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str: + # create a shared backbone featurestore + return the_backbone.descriptor + + +def function_as_dragon_proc( + entrypoint_fn: t.Callable[[t.Any], None], + args: t.List[t.Any], + cpu_affinity: t.List[int], + gpu_affinity: t.List[int], +) -> dragon_process.Process: + """Execute a function as an independent dragon process. 
+ + :param entrypoint_fn: The function to execute + :param args: The arguments for the entrypoint function + :param cpu_affinity: The cpu affinity for the process + :param gpu_affinity: The gpu affinity for the process + :returns: The dragon process handle + """ + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=entrypoint_fn, + args=args, + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index ed9ac625cd..e9c356b4e0 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -34,7 +34,7 @@ import torch import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -98,7 +98,7 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -116,7 +116,7 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -141,7 +141,7 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -159,7 +159,7 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -182,7 +182,7 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -199,11 +199,9 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = 
InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) worker = MachineLearningWorkerCore @@ -223,9 +221,9 @@ def test_fetch_input_disk_missing() -> None: fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -245,14 +243,12 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -284,13 +280,13 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> request = InferenceRequest( input_keys=[ - FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), + TensorKey(key=tensor_name + "1", descriptor=fsd), + TensorKey(key=tensor_name + "2", descriptor=fsd), + TensorKey(key=tensor_name + "3", descriptor=fsd), ] ) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -310,9 +306,9 @@ def test_fetch_input_feature_store_missing() -> None: key = "bad-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -332,9 +328,9 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: key = "test-model" feature_store[key] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -351,9 +347,9 @@ def test_place_outputs() -> None: # create a key to retrieve from the feature store keys = [ - FeatureStoreKey(key=key_name + "1", descriptor=fsd), - FeatureStoreKey(key=key_name + "2", descriptor=fsd), - FeatureStoreKey(key=key_name + "3", 
descriptor=fsd), + TensorKey(key=key_name + "1", descriptor=fsd), + TensorKey(key=key_name + "2", descriptor=fsd), + TensorKey(key=key_name + "3", descriptor=fsd), ] data = [b"abcdef", b"ghijkl", b"mnopqr"] @@ -376,6 +372,6 @@ def test_place_outputs() -> None: pytest.param("key", "", id="invalid descriptor"), ], ) -def test_invalid_featurestorekey(key, descriptor) -> None: +def test_invalid_tensorkey(key, descriptor) -> None: with pytest.raises(ValueError): - fsk = FeatureStoreKey(key, descriptor) + fsk = TensorKey(key, descriptor) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index c58879cb62..d270e921cb 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -36,7 +36,8 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -116,9 +117,9 @@ def test_device_manager_model_in_request(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -154,9 +155,9 @@ def test_device_manager_model_key(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py new file mode 100644 index 0000000000..2b2ef50f99 --- /dev/null +++ b/tests/dragon/test_dragon_backend.py @@ -0,0 +1,307 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import uuid + +import pytest + +dragon = pytest.importorskip("dragon") + + +from smartsim._core.launcher.dragon.dragonBackend import DragonBackend +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.infrastructure.comm.event import ( + OnCreateConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.control.listener import ( + ConsumerRegistrationListener, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.log import get_logger + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +logger = get_logger(__name__) + + +@pytest.fixture(scope="module") +def the_backend() -> DragonBackend: + return DragonBackend(pid=9999) + + +def test_dragonbackend_start_listener(the_backend: DragonBackend): + """Verify the background process listening to consumer registration events + is up and processing messages as expected.""" + + # We need to let the backend create the backbone to continue + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor + + with pytest.raises(KeyError) as ex: + # we expect the value of the consumer to be empty until + # the listener start-up completes. 
+ backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + + assert "not found" in ex.value.args[0] + + drg_process = the_backend.start_event_listener(cpu_affinity=[], gpu_affinity=[]) + + # # confirm there is a process still running + logger.info(f"Dragon process started: {drg_process}") + assert drg_process is not None, "Backend was unable to start event listener" + assert drg_process.puid != 0, "Process unique ID is empty" + assert drg_process.returncode is None, "Listener terminated early" + + # wait for the event listener to come up + try: + config = backbone.wait_for( + [BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], timeout=30 + ) + # verify result was in the returned configuration map + assert config[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + except Exception: + raise KeyError( + f"Unable to locate {BackboneFeatureStore.MLI_REGISTRAR_CONSUMER}" + "in the backbone" + ) + + # wait_for ensures the normal retrieval will now work, error-free + descriptor = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + assert descriptor is not None + + # register a new listener channel + comm_channel = DragonCommChannel.from_descriptor(descriptor) + mock_descriptor = str(uuid.uuid4()) + event = OnCreateConsumer("test_dragonbackend_start_listener", mock_descriptor, []) + + event_bytes = bytes(event) + comm_channel.send(event_bytes) + + subscriber_list = [] + + # Give the channel time to write the message and the listener time to handle it + for i in range(20): + time.sleep(1) + # Retrieve the subscriber list from the backbone and verify it is updated + if subscriber_list := backbone.notification_channels: + logger.debug(f"The subscriber list was populated after {i} iterations") + break + + assert mock_descriptor in subscriber_list + + # now send a shutdown message to terminate the listener + return_code = drg_process.returncode + + # clean up if the OnShutdownRequested wasn't properly handled + if return_code is None and drg_process.is_alive: + drg_process.kill() + drg_process.join() + + +def test_dragonbackend_backend_consumer(the_backend: DragonBackend): + """Verify the listener background process updates the appropriate + value in the backbone.""" + + # We need to let the backend create the backbone to continue + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + assert backbone._allow_reserved_writes + + # create listener with `as_service=False` to perform a single loop iteration + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) + + logger.debug(f"backbone loaded? {listener._backbone}") + logger.debug(f"listener created? {listener}") + + try: + # call the service execute method directly to trigger + # the entire service lifecycle + listener.execute() + + consumer_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + logger.debug(f"MLI_REGISTRAR_CONSUMER: {consumer_desc}") + + assert consumer_desc + except Exception as ex: + logger.info("") + finally: + listener._on_shutdown() + + +def test_dragonbackend_event_handled(the_backend: DragonBackend): + """Verify the event listener process updates the appropriate + value in the backbone when an event is received and again on shutdown. 
+ """ + # We need to let the backend create the backbone to continue + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + # create the listener to be tested + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) + + assert listener._backbone, "The listener is not attached to a backbone" + + try: + # set up the listener but don't let the service event loop start + listener._create_eventing() # listener.execute() + + # grab the channel descriptor so we can simulate registrations + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) + + num_events = 5 + events = [] + for i in range(num_events): + # register some mock consumers using the backend channel + event = OnCreateConsumer( + "test_dragonbackend_event_handled", + f"mock-consumer-descriptor-{uuid.uuid4()}", + [], + ) + event_bytes = bytes(event) + comm_channel.send(event_bytes) + events.append(event) + + # run few iterations of the event loop in case it takes a few cycles to write + for _ in range(20): + listener._on_iteration() + # Grab the value that should be getting updated + notify_consumers = set(backbone.notification_channels) + if len(notify_consumers) == len(events): + logger.info(f"Retrieved all consumers after {i} listen cycles") + break + + # ... and confirm that all the mock consumer descriptors are registered + assert set([e.descriptor for e in events]) == set(notify_consumers) + logger.info(f"Number of registered consumers: {len(notify_consumers)}") + + except Exception as ex: + logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}") + assert False + finally: + # shutdown should unregister a registration listener + listener._on_shutdown() + + for i in range(10): + if BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in backbone: + logger.debug(f"The listener was removed after {i} iterations") + channel_desc = None + break + + # we should see that there is no listener registered + assert not channel_desc, "Listener shutdown failed to clean up the backbone" + + +def test_dragonbackend_shutdown_event(the_backend: DragonBackend): + """Verify the background process shuts down when it receives a + shutdown request.""" + + # We need to let the backend create the backbone to continue + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=True) + + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + + # grab the channel descriptor so we can publish to it + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) + + assert listener._consumer.listening, "Listener isn't ready to listen" + + # send a shutdown request... 
+ event = OnShutdownRequested("test_dragonbackend_shutdown_event") + event_bytes = bytes(event) + comm_channel.send(event_bytes, 0.1) + + # execute should encounter the shutdown and exit + listener.execute() + + # ...and confirm the listener is now cancelled + assert not listener._consumer.listening + + +@pytest.mark.parametrize("health_check_frequency", [10, 20]) +def test_dragonbackend_shutdown_on_health_check( + the_backend: DragonBackend, + health_check_frequency: float, +): + """Verify that the event listener automatically shuts down when + a new listener is registered in its place. + + :param health_check_frequency: The expected frequency of service health check + invocations""" + + # We need to let the backend create the backbone to continue + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + + listener = ConsumerRegistrationListener( + backbone, + 1.0, + 1.0, + as_service=True, # allow service to run long enough to health check + health_check_frequency=health_check_frequency, + ) + + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + assert listener._consumer.listening, "Listener wasn't ready to listen" + + # Replace the consumer descriptor in the backbone to trigger + # an automatic shutdown + backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = str(uuid.uuid4()) + + # set the last health check manually to verify the duration + start_at = time.time() + listener._last_health_check = time.time() + + # run execute to let the service trigger health checks + listener.execute() + elapsed = time.time() - start_at + + # confirm the frequency of the health check was honored + assert elapsed >= health_check_frequency + + # ...and confirm the listener is now cancelled + assert ( + not listener._consumer.listening + ), "Listener was not automatically shutdown by the health check" diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py new file mode 100644 index 0000000000..c8bf687ef1 --- /dev/null +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -0,0 +1,117 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +dragon = pytest.importorskip("dragon") + +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim._core.mli.infrastructure.storage import dragon_util +from smartsim.log import get_logger + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +logger = get_logger(__name__) + + +@pytest.mark.parametrize( + "num_nodes, num_managers, mem_per_node", + [ + pytest.param(1, 1, 3 * 1024**2, id="3MB, Bare minimum allocation"), + pytest.param(2, 2, 128 * 1024**2, id="128 MB allocation, 2 nodes, 2 mgr"), + pytest.param(2, 1, 512 * 1024**2, id="512 MB allocation, 2 nodes, 1 mgr"), + ], +) +def test_dragon_storage_util_create_ddict( + num_nodes: int, + num_managers: int, + mem_per_node: int, +): + """Verify that a dragon dictionary is successfully created. + + :param num_nodes: Number of ddict nodes to attempt to create + :param num_managers: Number of managers per node to request + :param mem_per_node: Memory to allocate per node + """ + ddict = dragon_util.create_ddict(num_nodes, num_managers, mem_per_node) + + assert ddict is not None + + +@pytest.mark.parametrize( + "num_nodes, num_managers, mem_per_node", + [ + pytest.param(-1, 1, 3 * 1024**2, id="Negative Node Count"), + pytest.param(0, 1, 3 * 1024**2, id="Invalid Node Count"), + pytest.param(1, -1, 3 * 1024**2, id="Negative Mgr Count"), + pytest.param(1, 0, 3 * 1024**2, id="Invalid Mgr Count"), + pytest.param(1, 1, -3 * 1024**2, id="Negative Mem Per Node"), + pytest.param(1, 1, (3 * 1024**2) - 1, id="Invalid Mem Per Node"), + pytest.param(1, 1, 0 * 1024**2, id="No Mem Per Node"), + ], +) +def test_dragon_storage_util_create_ddict_validators( + num_nodes: int, + num_managers: int, + mem_per_node: int, +): + """Verify that invalid ddict creation arguments are rejected with a ValueError. + + :param num_nodes: Number of ddict nodes to attempt to create + :param num_managers: Number of managers per node to request + :param mem_per_node: Memory to allocate per node + """ + with pytest.raises(ValueError): + dragon_util.create_ddict(num_nodes, num_managers, mem_per_node) + + +def test_dragon_storage_util_get_ddict_descriptor(the_storage: dragon_ddict.DDict): + """Verify that a descriptor is created. + + :param the_storage: A pre-allocated ddict + """ + value = dragon_util.ddict_to_descriptor(the_storage) + + assert isinstance(value, str) + assert len(value) > 0 + + +def test_dragon_storage_util_get_ddict_from_descriptor(the_storage: dragon_ddict.DDict): + """Verify that a ddict is created from a descriptor.
+ + :param the_storage: A pre-allocated ddict + """ + descriptor = dragon_util.ddict_to_descriptor(the_storage) + + value = dragon_util.descriptor_to_ddict(descriptor) + + assert value is not None + assert isinstance(value, dragon_ddict.DDict) + assert dragon_util.ddict_to_descriptor(value) == descriptor diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index e9bcc8dfd9..07b2a45c1c 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -28,15 +28,15 @@ dragon = pytest.importorskip("dragon") +import dragon.data.ddict.ddict as dragon_ddict import dragon.utils as du -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.fli import DragonFLIError, FLInterface +from dragon.fli import FLInterface from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( DragonFeatureStore, ) from smartsim.error.errors import SmartSimError @@ -53,11 +53,12 @@ ], ) def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): - """A descriptor can be stored, loaded, and reattached""" - chan = Channel.make_process_local() + """A descriptor can be stored, loaded, and reattached.""" + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, + du.B64.bytes_to_str(queue.serialize()), ) config = EnvironmentConfigLoader( @@ -76,11 +77,12 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded - queue are the same""" - chan = Channel.make_process_local() + queue are the same.""" + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, + du.B64.bytes_to_str(queue.serialize()), ) config = EnvironmentConfigLoader( @@ -93,8 +95,10 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): - """An incorrect serialized descriptor will fails to attach""" - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") + """An incorrect serialized descriptor will fails to attach.""" + + monkeypatch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "randomstring") + config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=None, @@ -105,11 +109,15 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): config.get_queue() -def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): +def test_environment_loader_backbone_load_dfs( + monkeypatch: pytest.MonkeyPatch, the_storage: dragon_ddict.DDict +): """Verify the dragon feature store is loaded correctly by the - EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" - feature_store = 
DragonFeatureStore(DDict()) - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) + EnvironmentConfigLoader to demonstrate featurestore_factory correctness.""" + feature_store = DragonFeatureStore(the_storage) + monkeypatch.setenv( + EnvironmentConfigLoader.BACKBONE_ENV_VAR, feature_store.descriptor + ) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -123,13 +131,17 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): assert backbone is not None -def test_environment_variables_not_set(): +def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch): """EnvironmentConfigLoader getters return None when environment - variables are not set""" - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - assert config.get_backbone() is None - assert config.get_queue() is None + variables are not set.""" + with monkeypatch.context() as patch: + patch.setenv(EnvironmentConfigLoader.BACKBONE_ENV_VAR, "") + patch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "") + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, + ) + assert config.get_backbone() is None + assert config.get_queue() is None diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 618b00d87e..aacd47b556 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
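Both the environment-loader tests above and the worker-manager fixtures below lean on the same wiring pattern: serialize a descriptor, publish it under the loader's environment-variable names, and let `EnvironmentConfigLoader` rebuild the attachments through its factories. A minimal sketch of that pattern, assuming a pytest `monkeypatch` fixture and pre-serialized descriptor strings (the `make_loader` helper itself is illustrative, not part of this change):

import pytest

from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    DragonFeatureStore,
)


def make_loader(
    monkeypatch: pytest.MonkeyPatch, queue_descriptor: str, backbone_descriptor: str
) -> EnvironmentConfigLoader:
    # illustrative helper: publish the descriptors under the loader's env-var names ...
    monkeypatch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, queue_descriptor)
    monkeypatch.setenv(EnvironmentConfigLoader.BACKBONE_ENV_VAR, backbone_descriptor)

    # ... and let the factories reattach to the underlying dragon resources
    return EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
        callback_factory=DragonCommChannel.from_descriptor,
        queue_factory=DragonFLIChannel.from_descriptor,
    )

With no descriptors published (or published as empty strings), `get_backbone()` and `get_queue()` simply return `None`, which is the behavior `test_environment_variables_not_set` pins down.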
+import typing as t from unittest.mock import MagicMock import pytest @@ -32,14 +33,13 @@ import multiprocessing as mp -import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface from dragon.mpbridge.queues import DragonQueue +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -48,25 +48,30 @@ exception_handler, ) from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, FetchModelResult, - InferenceReply, InferenceRequest, LoadModelResult, + MachineLearningWorkerBase, RequestBatch, TransformInputResult, TransformOutputResult, ) from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.mli.mli_schemas.response.response_capnp import ResponseBuilder from .utils.channel import FileSystemCommChannel from .utils.worker import IntegratedTorchWorker @@ -75,37 +80,29 @@ pytestmark = pytest.mark.dragon -@pytest.fixture -def backbone_descriptor() -> str: - # create a shared backbone featurestore - feature_store = DragonFeatureStore(DDict()) - return feature_store.descriptor - - -@pytest.fixture -def app_feature_store() -> FeatureStore: +@pytest.fixture(scope="module") +def app_feature_store(the_storage) -> FeatureStore: # create a standalone feature store to mimic a user application putting # data into an application-owned resource (app should not access backbone) - app_fs = DragonFeatureStore(DDict()) + app_fs = DragonFeatureStore(the_storage) return app_fs @pytest.fixture def setup_worker_manager_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -113,7 +110,7 @@ def setup_worker_manager_model_bytes( queue_factory=DragonFLIChannel.from_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -123,10 +120,10 @@ def setup_worker_manager_model_bytes( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + 
tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - request = InferenceRequest( + inf_request = InferenceRequest( model_key=None, callback=None, raw_inputs=None, @@ -137,10 +134,10 @@ def setup_worker_manager_model_bytes( batch_size=0, ) - model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="key", descriptor=app_feature_store.descriptor) request_batch = RequestBatch( - [request], + [inf_request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_id=model_id, ) @@ -155,16 +152,15 @@ def setup_worker_manager_model_key( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -172,7 +168,7 @@ def setup_worker_manager_model_key( queue_factory=DragonFLIChannel.from_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -182,9 +178,9 @@ def setup_worker_manager_model_key( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) + tensor_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + output_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="model key", descriptor=app_feature_store.descriptor) request = InferenceRequest( model_key=model_id, @@ -208,20 +204,19 @@ def setup_worker_manager_model_key( @pytest.fixture def setup_request_dispatcher_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -252,20 +247,19 @@ def setup_request_dispatcher_model_bytes( @pytest.fixture def setup_request_dispatcher_model_key( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = 
IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -284,7 +278,7 @@ def setup_request_dispatcher_model_key( tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model_key = MessageHandler.build_model_key( - key="model key", feature_store_descriptor=app_feature_store.descriptor + key="model key", descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None @@ -296,8 +290,12 @@ def setup_request_dispatcher_model_key( return request_dispatcher, integrated_worker_type -def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): - def mock_stage(*args, **kwargs): +def mock_pipeline_stage( + monkeypatch: pytest.MonkeyPatch, + integrated_worker: MachineLearningWorkerBase, + stage: str, +) -> t.Callable[[t.Any], ResponseBuilder]: + def mock_stage(*args: t.Any, **kwargs: t.Any) -> None: raise ValueError(f"Simulated error in {stage}") monkeypatch.setattr(integrated_worker, stage, mock_stage) @@ -314,8 +312,10 @@ def mock_stage(*args, **kwargs): mock_reply_channel = MagicMock() mock_reply_channel.send = MagicMock() - def mock_exception_handler(exc, reply_channel, failure_message): - return exception_handler(exc, mock_reply_channel, failure_message) + def mock_exception_handler( + exc: Exception, reply_channel: CommChannelBase, failure_message: str + ) -> None: + exception_handler(exc, mock_reply_channel, failure_message) monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.worker_manager.exception_handler", @@ -362,12 +362,12 @@ def mock_exception_handler(exc, reply_channel, failure_message): ], ) def test_wm_pipeline_stage_errors_handled( - request, - setup_worker_manager, + request: pytest.FixtureRequest, + setup_worker_manager: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the worker manager does not crash after a failure in various pipeline stages""" worker_manager, integrated_worker_type = request.getfixturevalue( setup_worker_manager @@ -446,12 +446,12 @@ def test_wm_pipeline_stage_errors_handled( ], ) def test_dispatcher_pipeline_stage_errors_handled( - request, - setup_request_dispatcher, + request: pytest.FixtureRequest, + setup_request_dispatcher: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the request dispatcher does not crash after a failure in various pipeline stages""" request_dispatcher, integrated_worker_type = request.getfixturevalue( setup_request_dispatcher @@ -473,7 +473,7 @@ def test_dispatcher_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) -def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): +def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None: """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" @@ 
-498,3 +498,14 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): mock_reply_fn.assert_called_once() mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.") + + +def test_dragon_feature_store_invalid_storage(): + """Verify that attempting to create a DragonFeatureStore without storage fails.""" + storage = None + + with pytest.raises(ValueError) as ex: + DragonFeatureStore(storage) + + assert "storage" in ex.value.args[0].lower() + assert "required" in ex.value.args[0].lower() diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py new file mode 100644 index 0000000000..8a241bab19 --- /dev/null +++ b/tests/dragon/test_event_consumer.py @@ -0,0 +1,386 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time +import typing as t +from unittest import mock + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( + OnCreateConsumer, + OnShutdownRequested, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.control.listener import ( + ConsumerRegistrationListener, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.log import get_logger + +logger = get_logger(__name__) + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# isort: on + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file must run in a dragon environment +pytestmark = pytest.mark.dragon + + +def test_eventconsumer_eventpublisher_integration( + the_backbone: t.Any, test_dir: str +) -> None: + """Verify that the publisher and consumer integrate as expected when + multiple publishers and consumers are sending simultaneously. 
This + test closely tracks the test in tests/test_featurestore_base.py also named + test_eventconsumer_eventpublisher_integration but requires dragon entities. + + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) + + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + the_backbone, + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], + ) + capp_consumer = EventConsumer( + capp_channel, + the_backbone, + ) + back_consumer = EventConsumer( + back_channel, + the_backbone, + filters=[OnCreateConsumer.CONSUMER_CREATED], + ) + + # create some broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + mock_client_app = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. + the_backbone.notification_channels = [ + wmgr_consumer_descriptor, + capp_consumer_descriptor, + back_consumer_descriptor, + ] + + # simulate worker manager sending a notification to backend that it's alive + event_1 = OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) + mock_worker_mgr.send(event_1) + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", + the_backbone.descriptor, + key, + ) + mock_client_app.send(event, timeout=0.1) + + # worker manager should only get updates about feature update + wmgr_messages = wmgr_consumer.recv() + assert len(wmgr_messages) == 3 + + # the backend should only receive messages about consumer creation + back_messages = back_consumer.recv() + assert len(back_messages) == 1 + + # hypothetical app has no filters and will get all events + app_messages = capp_consumer.recv() + assert len(app_messages) == 4 + + +@pytest.mark.parametrize( + " timeout, batch_timeout, exp_err_msg", + [(-1, 1, " timeout"), (1, -1, "batch_timeout")], +) +def test_eventconsumer_invalid_timeout( + timeout: float, + batch_timeout: float, + exp_err_msg: str, + test_dir: str, + the_backbone: BackboneFeatureStore, +) -> None: + """Verify that the event consumer raises an exception + when provided an invalid request timeout. 
+ + :param timeout: The request timeout for the event consumer recv call + :param batch_timeout: The batch timeout for the event consumer recv call + :param exp_err_msg: A unique value from the error message that should be raised + :param the_storage: The dragon storage engine to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + the_backbone, + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], + ) + + # the consumer should report an error for the invalid timeout value + with pytest.raises(ValueError) as ex: + wmgr_consumer.recv(timeout=timeout, batch_timeout=batch_timeout) + + assert exp_err_msg in ex.value.args[0] + + +def test_eventconsumer_no_event_handler_registered( + the_backbone: t.Any, test_dir: str +) -> None: + """Verify that a consumer discards messages when + on a channel if no handler is registered. + + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + + # create a consumer to receive messages + wmgr_consumer = EventConsumer(wmgr_channel, the_backbone, event_handler=None) + + # create a broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # manually register the consumers since we don't have a backend running + the_backbone.notification_channels = [wmgr_channel.descriptor] + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered", + the_backbone.descriptor, + key, + ) + mock_worker_mgr.send(event, timeout=0.1) + + # run the handler and let it discard messages + for _ in range(15): + wmgr_consumer.listen_once(0.2, 2.0) + + assert wmgr_consumer.listening + + +def test_eventconsumer_no_event_handler_registered_shutdown( + the_backbone: t.Any, test_dir: str +) -> None: + """Verify that a consumer without an event handler + registered still honors shutdown requests. 
+ + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + + # create a consumer to receive messages + wmgr_consumer = EventConsumer(wmgr_channel, the_backbone) + + # create a broadcaster to publish messages + mock_worker_mgr = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # manually register the consumers since we don't have a backend running + the_backbone.notification_channels = [ + wmgr_channel.descriptor, + capp_channel.descriptor, + ] + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered_shutdown", + the_backbone.descriptor, + key, + ) + mock_worker_mgr.send(event, timeout=0.1) + + event = OnShutdownRequested( + "test_eventconsumer_no_event_handler_registered_shutdown" + ) + mock_worker_mgr.send(event, timeout=0.1) + + # wmgr will stop listening to messages when it is told to stop listening + wmgr_consumer.listen(timeout=0.1, batch_timeout=2.0) + + for _ in range(15): + wmgr_consumer.listen_once(timeout=0.1, batch_timeout=2.0) + + # confirm the messages were processed, discarded, and the shutdown was received + assert not wmgr_consumer.listening + + +def test_eventconsumer_registration( + the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that a consumer is correctly registered in + the backbone after sending a registration request. Then, + confirm the consumer is unregistered after sending the + un-register request. + + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + with monkeypatch.context() as patch: + registrar = ConsumerRegistrationListener( + the_backbone, 1.0, 2.0, as_service=False + ) + + # NOTE: service.execute(as_service=False) will complete the service life- + # cycle and remove the registrar from the backbone, so mock _on_shutdown + disabled_shutdown = mock.MagicMock() + patch.setattr(registrar, "_on_shutdown", disabled_shutdown) + + # initialize registrar resources + registrar.execute() + + # create a consumer that will be registered + wmgr_channel = DragonCommChannel(create_local()) + wmgr_consumer = EventConsumer(wmgr_channel, the_backbone) + + registered_channels = the_backbone.notification_channels + + # trigger the consumer-to-registrar handshake + wmgr_consumer.register() + + current_registrations: t.List[str] = [] + + # have the registrar run a few times to pick up the message + for i in range(15): + registrar.execute() + current_registrations = the_backbone.notification_channels + if len(current_registrations) != len(registered_channels): + logger.debug(f"The event was processed on iteration {i}") + break + + # confirm the consumer is registered + assert wmgr_channel.descriptor in current_registrations + + # copy old list so we can compare against it.
+ registered_channels = list(current_registrations) + + # trigger the consumer removal + wmgr_consumer.unregister() + + # have the registrar run a few times to pick up the msg + for i in range(15): + registrar.execute() + current_registrations = the_backbone.notification_channels + if len(current_registrations) != len(registered_channels): + logger.debug(f"The event was processed on iteration {i}") + break + + # confirm the consumer is no longer registered + assert wmgr_channel.descriptor not in current_registrations + + +def test_registrar_teardown( + the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that the consumer registrar removes itself from + the backbone when it shuts down. + + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + with monkeypatch.context() as patch: + registrar = ConsumerRegistrationListener( + the_backbone, 1.0, 2.0, as_service=False + ) + + # directly initialze registrar resources to avoid service life-cycle + registrar._create_eventing() + + # confirm the registrar is published to the backbone + cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], 10) + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in cfg + + # execute the entire service lifecycle 1x + registrar.execute() + + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone + + for i in range(15): + time.sleep(0.1) + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone + if not consumer_found: + logger.debug(f"Registrar removed from the backbone on iteration {i}") + break + + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in the_backbone diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py new file mode 100644 index 0000000000..019dcde7a0 --- /dev/null +++ b/tests/dragon/test_featurestore.py @@ -0,0 +1,327 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
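The registration tests above all drive the same handshake: a consumer announces itself on the registrar's channel, the registrar's next pass folds the new descriptor into the backbone's notification list, and unregistering reverses it. A condensed sketch of that round trip, assuming an attached `BackboneFeatureStore` and glossing over the retry loops the real tests need (the `registration_round_trip` helper is illustrative only):

from unittest import mock

from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_util import create_local
from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
from smartsim._core.mli.infrastructure.control.listener import (
    ConsumerRegistrationListener,
)
from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)


def registration_round_trip(the_backbone: BackboneFeatureStore) -> None:
    # illustrative helper, not part of the PR
    registrar = ConsumerRegistrationListener(the_backbone, 1.0, 2.0, as_service=False)
    # keep the registrar alive across execute() calls; a bare execute() completes
    # the full service lifecycle, including its shutdown/cleanup step
    registrar._on_shutdown = mock.MagicMock()
    registrar.execute()  # publishes the registrar channel into the backbone

    channel = DragonCommChannel(create_local())
    consumer = EventConsumer(channel, the_backbone)

    consumer.register()  # announce the consumer on the registrar channel
    registrar.execute()  # fold the new descriptor into the backbone
    assert channel.descriptor in the_backbone.notification_channels

    consumer.unregister()  # reverse the handshake ...
    registrar.execute()  # ... and let the registrar prune the descriptor
    assert channel.descriptor not in the_backbone.notification_channels

In practice a single `execute()` pass may not be enough for the registrar to pick up the message, which is why the tests above poll in a loop rather than asserting after one pass.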
+ + +import multiprocessing as mp +import random +import time +import typing as t +import unittest.mock as mock +import uuid + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + time as bbtime, +) +from smartsim.log import get_logger + +logger = get_logger(__name__) + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# isort: on + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file must run in a dragon environment +pytestmark = pytest.mark.dragon + + +def test_backbone_wait_for_no_keys( + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for a value succeeds + immediately and does not cause a wait to occur if the supplied key + list is empty. + + :param the_backbone: the storage engine to use, prepopulated with + """ + # set a very low timeout to confirm that it does not wait + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. + ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = the_backbone.wait_for([]) + assert len(values) == 0 + + # confirm that no wait occurred + bbtime.sleep.assert_not_called() + + +def test_backbone_wait_for_prepopulated( + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for a value succeed + immediately and do not cause a wait to occur if the data exists. + + :param the_backbone: the storage engine to use, prepopulated with + """ + # set a very low timeout to confirm that it does not wait + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. + ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = the_backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 0.1) + + # confirm that wait_for with one key returns one value + assert len(values) == 1 + + # confirm that the descriptor is non-null w/some non-trivial value + assert len(values[BackboneFeatureStore.MLI_WORKER_QUEUE]) > 5 + + # confirm that no wait occurred + bbtime.sleep.assert_not_called() + + +def test_backbone_wait_for_prepopulated_dupe( + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for keys that are duplicated + results in a single value being returned for each key. + + :param the_backbone: the storage engine to use, prepopulated with + """ + # set a very low timeout to confirm that it does not wait + + key1, key2 = "key-1", "key-2" + value1, value2 = "i-am-value-1", "i-am-value-2" + the_backbone[key1] = value1 + the_backbone[key2] = value2 + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. 
+ ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = the_backbone.wait_for([key1, key2, key1]) # key1 is duplicated + + # confirm that wait_for with one key returns one value + assert len(values) == 2 + assert key1 in values + assert key2 in values + + assert values[key1] == value1 + assert values[key2] == value2 + + +def set_value_after_delay( + descriptor: str, key: str, value: str, delay: float = 5 +) -> None: + """Helper method to persist a random value into the backbone + + :param descriptor: the backbone feature store descriptor to attach to + :param key: the key to write to + :param value: a value to write to the key + :param delay: amount of delay to apply before writing the key + """ + time.sleep(delay) + + backbone = BackboneFeatureStore.from_descriptor(descriptor) + backbone[key] = value + logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]") + + +@pytest.mark.parametrize( + "delay", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 2, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 4, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 8, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + ], +) +def test_backbone_wait_for_partial_prepopulated( + the_backbone: BackboneFeatureStore, delay: float +) -> None: + """Verify that when data is not all in the backbone, the `wait_for` operation + continues to poll until it finds everything it needs. 
+ + :param the_backbone: the storage engine to use, prepopulated with + :param delay: the number of seconds the second process will wait before + setting the target value in the backbone featurestore + """ + # set a very low timeout to confirm that it does not wait + wait_timeout = 10 + + key, value = str(uuid.uuid4()), str(random.random() * 10) + + logger.debug(f"Starting process to write {key} after {delay}s") + p = mp.Process( + target=set_value_after_delay, args=(the_backbone.descriptor, key, value, delay) + ) + p.start() + + p2 = mp.Process( + target=the_backbone.wait_for, + args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],), + kwargs={"timeout": wait_timeout}, + ) + p2.start() + + p.join() + p2.join() + + # both values should be written at this time + ret_vals = the_backbone.wait_for( + [key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1 + ) + # confirm that wait_for with two keys returns two values + assert len(ret_vals) == 2, "values should contain values for both awaited keys" + + # confirm the pre-populated value has the correct output + assert ( + ret_vals[BackboneFeatureStore.MLI_WORKER_QUEUE] == "12345" + ) # mock descriptor value from fixture + + # confirm the population process completed and the awaited value is correct + assert ret_vals[key] == value, "verify order of values " + + +@pytest.mark.parametrize( + "num_keys", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 3, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 7, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 11, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + ], +) +def test_backbone_wait_for_multikey( + the_backbone: BackboneFeatureStore, + num_keys: int, + test_dir: str, +) -> None: + """Verify that asking the backbone to wait for multiple keys results + in that number of values being returned. 
+ + :param the_backbone: the storage engine to use, prepopulated with + :param num_keys: the number of extra keys to set & request in the backbone + """ + # maximum delay allowed for setter processes + max_delay = 5 + + extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)] + extra_values = [str(uuid.uuid4()) for _ in range(num_keys)] + extras = dict(zip(extra_keys, extra_values)) + delays = [random.random() * max_delay for _ in range(num_keys)] + processes = [] + + for key, value, delay in zip(extra_keys, extra_values, delays): + assert delay < max_delay, "write delay exceeds test timeout" + logger.debug(f"Delaying {key} write by {delay} seconds") + p = mp.Process( + target=set_value_after_delay, + args=(the_backbone.descriptor, key, value, delay), + ) + p.start() + processes.append(p) + + p2 = mp.Process( + target=the_backbone.wait_for, + args=(extra_keys,), + kwargs={"timeout": max_delay * 2}, + ) + p2.start() + for p in processes: + p.join(timeout=max_delay * 2) + p2.join( + timeout=max_delay * 2 + ) # give it 10 seconds longer than p2 timeout for backoff + + # use without a wait to verify all values are written + num_keys = len(extra_keys) + actual_values = the_backbone.wait_for(extra_keys, timeout=0.01) + assert len(extra_keys) == num_keys + + # confirm that wait_for returns all the expected values + assert len(actual_values) == num_keys + + # confirm that the returned values match (e.g. are returned in the right order) + for k in extras: + assert extras[k] == actual_values[k] diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 932e734c8a..6daceb9061 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -24,20 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pathlib +import time import typing as t import pytest dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( OnCreateConsumer, OnWriteFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -55,15 +57,21 @@ pytestmark = pytest.mark.dragon +def boom(*args, **kwargs) -> None: + """Helper function that blows up when used to mock up + some other function.""" + raise Exception(f"you shall not pass! 
{args}, {kwargs}") + + def test_event_uid() -> None: - """Verify that all events include a unique identifier""" + """Verify that all events include a unique identifier.""" uids: t.Set[str] = set() num_iters = 1000 # generate a bunch of events and keep track all the IDs for i in range(num_iters): - event_a = OnCreateConsumer(str(i)) - event_b = OnWriteFeatureStore(str(i), "key") + event_a = OnCreateConsumer("test_event_uid", str(i), filters=[]) + event_b = OnWriteFeatureStore("test_event_uid", "test_event_uid", str(i)) uids.add(event_a.uid) uids.add(event_b.uid) @@ -74,7 +82,7 @@ def test_event_uid() -> None: def test_mli_reserved_keys_conversion() -> None: """Verify that conversion from a string to an enum member - works as expected""" + works as expected.""" for reserved_key in ReservedKeys: # iterate through all keys and verify `from_string` works @@ -87,7 +95,7 @@ def test_mli_reserved_keys_conversion() -> None: def test_mli_reserved_keys_writes() -> None: """Verify that attempts to write to reserved keys are blocked from a - standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + standard DragonFeatureStore but enabled with the BackboneFeatureStore.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -116,10 +124,8 @@ def test_mli_reserved_keys_writes() -> None: def test_mli_consumers_read_by_key() -> None: - """Verify that the value returned from the mli consumers - method is written to the correct key and reads are - allowed via standard dragon feature store. - NOTE: should reserved reads also be blocked""" + """Verify that the value returned from the mli consumers method is written + to the correct key and reads are allowed via standard dragon feature store.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -138,7 +144,7 @@ def test_mli_consumers_read_by_key() -> None: def test_mli_consumers_read_by_backbone() -> None: """Verify that the backbone reads the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -152,7 +158,7 @@ def test_mli_consumers_read_by_backbone() -> None: def test_mli_consumers_write_by_backbone() -> None: """Verify that the backbone writes the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -166,10 +172,11 @@ def test_mli_consumers_write_by_backbone() -> None: def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} consumer_descriptor = storage_path / "test-consumer" @@ -177,7 +184,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # NOTE: we're not putting any consumers into the backbone here! 
backbone = BackboneFeatureStore(mock_storage) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) publisher = EventBroadcaster(backbone) num_receivers = 0 @@ -185,7 +194,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # publishing this event without any known consumers registered should succeed # but report that it didn't have anybody to send the event to consumer_descriptor = storage_path / f"test-consumer" - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) num_receivers += publisher.send(event) @@ -201,10 +212,11 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -215,7 +227,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_to_empty_consumer_list", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, channel_factory=FileSystemCommChannel.from_descriptor ) @@ -233,10 +249,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: """Verify that a broadcast operation reports an error if no channel - factory was supplied for constructing the consumer channels + factory was supplied for constructing the consumer channels. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -247,7 +264,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [consumer_descriptor] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_without_channel_factory", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied @@ -261,10 +282,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: """Verify that a successful broadcast clears messages from the event - buffer when a new message is sent and consumers are registered + buffer when a new message is sent and consumers are registered. 
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -281,11 +303,17 @@ def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: # mock building up some buffered events num_buffered_events = 14 for i in range(num_buffered_events): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer( + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) event0 = OnCreateConsumer( - storage_path / f"test-consumer-{str(num_buffered_events + 1)}" + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(num_buffered_events + 1)}", + [], ) num_receivers = publisher.send(event0) @@ -332,13 +360,21 @@ def test_eventpublisher_broadcast_returns_total_sent( # mock building up some buffered events for i in range(num_buffered): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) assert publisher.num_buffered == num_buffered # this event will trigger clearing anything already in buffer - event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}") + event0 = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{num_buffered}", + [], + ) # num_receivers should contain a number that computes w/all consumers and all events num_receivers = publisher.send(event0) @@ -347,10 +383,11 @@ def test_eventpublisher_broadcast_returns_total_sent( def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: - """Verify that any unused consumers are pruned each time a new event is sent + """Verify that any unused consumers are pruned each time a new event is sent. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -363,7 +400,11 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: backbone, channel_factory=FileSystemCommChannel.from_descriptor ) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_prune_unused_consumer", + consumer_descriptor, + filters=[], + ) # the only registered cnosumer is in the event, expect no pruning backbone.notification_channels = (consumer_descriptor,) @@ -377,7 +418,9 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: # ... and remove the old descriptor from the backbone when it's looked up backbone.notification_channels = (consumer_descriptor2,) - event = OnCreateConsumer(consumer_descriptor2) + event = OnCreateConsumer( + "test_eventpublisher_prune_unused_consumer", consumer_descriptor2, filters=[] + ) publisher.send(event) @@ -413,12 +456,13 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: def test_eventpublisher_serialize_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during message serialization are raised to the caller + """Verify that errors during message serialization are raised to the caller. 
:param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -433,15 +477,21 @@ def test_eventpublisher_serialize_failure( ) with monkeypatch.context() as patch: - event = OnCreateConsumer(target_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_serialize_failure", target_descriptor, filters=[] + ) # patch the __bytes__ implementation to cause pickling to fail during send - patch.setattr(event, "__bytes__", lambda x: b"abc") + def bad_bytes(self) -> bytes: + return b"abc" + + # this patch causes an attribute error when event pickling is attempted + patch.setattr(event, "__bytes__", bad_bytes) backbone.notification_channels = (target_descriptor,) # send a message into the channel - with pytest.raises(ValueError) as ex: + with pytest.raises(AttributeError) as ex: publisher.send(event) assert "serialize" in ex.value.args[0] @@ -450,12 +500,13 @@ def test_eventpublisher_serialize_failure( def test_eventpublisher_factory_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during channel construction are raised to the caller + """Verify that errors during channel construction are raised to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -471,7 +522,9 @@ def boom(descriptor: str) -> None: publisher = EventBroadcaster(backbone, channel_factory=boom) with monkeypatch.context() as patch: - event = OnCreateConsumer(target_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_factory_failure", target_descriptor, filters=[] + ) backbone.notification_channels = (target_descriptor,) @@ -484,12 +537,13 @@ def boom(descriptor: str) -> None: def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: """Verify that unexpected errors during message send are caught and wrapped in a - SmartSimError so they are not propagated directly to the caller + SmartSimError so they are not propagated directly to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -507,7 +561,9 @@ def boom(self) -> None: raise Exception("That was unexpected...") with monkeypatch.context() as patch: - event = OnCreateConsumer(target_descriptor) + event = OnCreateConsumer( + "test_eventpublisher_failure", target_descriptor, filters=[] + ) # patch the _broadcast implementation to cause send to fail after # after the event has been pickled @@ -524,10 +580,11 @@ def boom(self) -> None: def test_eventconsumer_receive(test_dir: str) -> None: - """Verify that a consumer retrieves a message from the given channel + """Verify that a consumer retrieves a message from the given channel. 
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -538,14 +595,16 @@ def test_eventconsumer_receive(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage) comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) - event = OnCreateConsumer(target_descriptor) + event = OnCreateConsumer( + "test_eventconsumer_receive", target_descriptor, filters=[] + ) # simulate a sent event by writing directly to the input comm channel comm_channel.send(bytes(event)) consumer = EventConsumer(comm_channel, backbone) - all_received: t.List[OnCreateConsumer] = consumer.receive() + all_received: t.List[OnCreateConsumer] = consumer.recv() assert len(all_received) == 1 # verify we received the same event that was raised @@ -555,12 +614,13 @@ def test_eventconsumer_receive(test_dir: str) -> None: @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: - """Verify that a consumer retrieves multiple message from the given channel + """Verify that a consumer retrieves multiple message from the given channel. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param num_sent: parameterized value used to vary the number of events - that are enqueued and validations are checked at multiple queue sizes""" + that are enqueued and validations are checked at multiple queue sizes + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -574,21 +634,24 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: # simulate multiple sent events by writing directly to the input comm channel for _ in range(num_sent): - event = OnCreateConsumer(target_descriptor) + event = OnCreateConsumer( + "test_eventconsumer_receive_multi", target_descriptor, filters=[] + ) comm_channel.send(bytes(event)) consumer = EventConsumer(comm_channel, backbone) - all_received: t.List[OnCreateConsumer] = consumer.receive() + all_received: t.List[OnCreateConsumer] = consumer.recv() assert len(all_received) == num_sent def test_eventconsumer_receive_empty(test_dir: str) -> None: """Verify that a consumer receiving an empty message ignores the - message and continues processing + message and continues processing. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -605,7 +668,7 @@ def test_eventconsumer_receive_empty(test_dir: str) -> None: consumer = EventConsumer(comm_channel, backbone) - messages = consumer.receive() + messages = consumer.recv() # the messages array should be empty assert not messages @@ -616,7 +679,8 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: multiple publishers and consumers are sending simultaneously. 
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -628,15 +692,15 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: capp_channel = FileSystemCommChannel(storage_path / "test-capp") back_channel = FileSystemCommChannel(storage_path / "test-backend") - wmgr_consumer_descriptor = wmgr_channel.descriptor.decode("utf-8") - capp_consumer_descriptor = capp_channel.descriptor.decode("utf-8") - back_consumer_descriptor = back_channel.descriptor.decode("utf-8") + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( wmgr_channel, backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) capp_consumer = EventConsumer( capp_channel, @@ -645,7 +709,7 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: back_consumer = EventConsumer( back_channel, backbone, - filters=[EventCategory.CONSUMER_CREATED], + filters=[OnCreateConsumer.CONSUMER_CREATED], ) # create some broadcasters to publish messages @@ -667,28 +731,38 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: ] # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor) + event_1 = OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) mock_worker_mgr.send(event_1) # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") - event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") - event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + event_2 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) + event_3 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-2" + ) + event_4 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) mock_client_app.send(event_2) mock_client_app.send(event_3) mock_client_app.send(event_4) # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() + wmgr_messages = wmgr_consumer.recv() assert len(wmgr_messages) == 3 # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() + back_messages = back_consumer.recv() assert len(back_messages) == 1 # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() + app_messages = capp_consumer.recv() assert len(app_messages) == 4 @@ -702,7 +776,8 @@ def test_eventconsumer_batch_timeout( :param invalid_timeout: any invalid timeout that should fail validation :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -713,11 +788,57 @@ def test_eventconsumer_batch_timeout( with pytest.raises(ValueError) as ex: # try to create a consumer w/a max recv size of 0 - EventConsumer( + consumer = 
EventConsumer( channel, backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=invalid_timeout, + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) + consumer.recv(batch_timeout=invalid_timeout) assert "positive" in ex.value.args[0] + + +@pytest.mark.parametrize( + "wait_timeout, exp_wait_max", + [ + # aggregate the 1+1+1 into 3 on remaining parameters + pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), + pytest.param(9, 3 + 2 + 4 + 8, id="9s wait, 6 cycle steps"), + # aggregate an entire cycle into 16 + pytest.param(19.5, 16 + 3 + 2 + 4, id="20s wait, repeat cycle"), + ], +) +def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None: + """Verify that attempts to attach to the worker queue from the protoclient + timeout in an appropriate amount of time. Note: due to the backoff, we verify + the elapsed time is less than the 15s of a cycle of waits. + + :param wait_timeout: Maximum amount of time (in seconds) to allow the backbone + to wait for the requested value to exist + :param exp_wait_max: Maximum amount of time (in seconds) to set as the upper + bound to allow the delays with backoff to occur + :param storage_for_dragon_fs: the dragon storage engine to use + """ + + # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] + # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) + start_time = time.time() + + storage = {} + backbone = BackboneFeatureStore(storage) + + with pytest.raises(SmartSimError) as ex: + backbone.wait_for(["does-not-exist"], wait_timeout) + + assert "timeout" in str(ex.value.args[0]).lower() + + end_time = time.time() + elapsed = end_time - start_time + + # confirm that we met our timeout + assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" + + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 59801eebe2..23fdc55ab6 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -30,21 +30,17 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_channel import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import ( DEFAULT_CHANNEL_BUFFER_SIZE, - DragonCommChannel, create_local, ) -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, - OnCreateConsumer, - OnWriteFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict # isort: off from dragon.channels import Channel @@ -59,187 +55,135 @@ pytestmark = pytest.mark.dragon -@pytest.fixture -def storage_for_dragon_fs() -> t.Dict[str, str]: - return dragon_ddict.DDict() - - -def test_eventconsumer_eventpublisher_integration( - storage_for_dragon_fs: t.Any, test_dir: str -) -> None: - 
"""Verify that the publisher and consumer integrate as expected when - multiple publishers and consumers are sending simultaneously. This - test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities - - :param storage_for_dragon_fs: the dragon storage engine to use - :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" - - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - mock_fs_descriptor = backbone.descriptor - - # verify ability to write and read from ddict - backbone["test_dir"] = test_dir - assert backbone["test_dir"] == test_dir - - wmgr_channel_ = Channel.make_process_local() - capp_channel_ = Channel.make_process_local() - back_channel_ = Channel.make_process_local() - +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonCommChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to.""" + wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) - capp_channel = DragonCommChannel(capp_channel_) - back_channel = DragonCommChannel(back_channel_) - - wmgr_consumer_descriptor = wmgr_channel.descriptor_string - capp_consumer_descriptor = capp_channel.descriptor_string - back_consumer_descriptor = back_channel.descriptor_string - - # create some consumers to receive messages - wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - ) - capp_consumer = EventConsumer( - capp_channel, - backbone, - ) - back_consumer = EventConsumer( - back_channel, - backbone, - filters=[EventCategory.CONSUMER_CREATED], - ) - - # create some broadcasters to publish messages - mock_worker_mgr = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - mock_client_app = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - - # register all of the consumers even though the OnCreateConsumer really should - # trigger its registration. event processing is tested elsewhere. 
- backbone.notification_channels = [ - wmgr_consumer_descriptor, - capp_consumer_descriptor, - back_consumer_descriptor, - ] - - # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor) - mock_worker_mgr.send(event_1) - - # simulate the app updating a model a few times - for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(backbone.descriptor, key) - mock_client_app.send(event, timeout=0.1) - - # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() - assert len(wmgr_messages) == 3 - - # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() - assert len(back_messages) == 1 - - # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() - assert len(app_messages) == 4 + return wmgr_channel @pytest.mark.parametrize( - "num_events, batch_timeout", + "num_events, batch_timeout, max_batches_expected", [ - pytest.param(1, 1.0, id="under 1s timeout"), - pytest.param(20, 1.0, id="test 1s timeout w/20"), - pytest.param(50, 1.0, id="test 1s timeout w/50"), - pytest.param(60, 0.1, id="small batches"), - pytest.param(100, 0.1, id="many small batches"), + pytest.param(1, 1.0, 2, id="under 1s timeout"), + pytest.param(20, 1.0, 3, id="test 1s timeout 20x"), + pytest.param(30, 0.2, 5, id="test 0.2s timeout 30x"), + pytest.param(60, 0.4, 4, id="small batches"), + pytest.param(100, 0.1, 10, id="many small batches"), ], ) def test_eventconsumer_max_dequeue( num_events: int, batch_timeout: float, - storage_for_dragon_fs: t.Any, + max_batches_expected: int, + the_worker_channel: DragonCommChannel, + the_backbone: BackboneFeatureStore, ) -> None: """Verify that a consumer does not sit and collect messages indefinitely - by checking that a consumer returns after a maximum timeout is exceeded - - :param num_events: the total number of events to raise in the test - :param batch_timeout: the maximum wait time for a message to be sent. - :param storage_for_dragon_fs: the dragon storage engine to use""" + by checking that a consumer returns after a maximum timeout is exceeded. - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - - wmgr_channel_ = Channel.make_process_local() - wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + :param num_events: Total number of events to raise in the test + :param batch_timeout: Maximum wait time (in seconds) for a message to be sent + :param max_batches_expected: Maximum number of receives that should occur + :param the_storage: Dragon storage engine to use + """ # create some consumers to receive messages wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=batch_timeout, + the_worker_channel, + the_backbone, + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) # create a broadcaster to publish messages mock_client_app = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) # register all of the consumers even though the OnCreateConsumer really should # trigger its registration. event processing is tested elsewhere. 
- backbone.notification_channels = [wmgr_consumer_descriptor] + the_backbone.notification_channels = [the_worker_channel.descriptor] # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(num_events)): - event = OnWriteFeatureStore(backbone.descriptor, key) - mock_client_app.send(event, timeout=0.1) + event = OnWriteFeatureStore( + "test_eventconsumer_max_dequeue", the_backbone.descriptor, key + ) + mock_client_app.send(event, timeout=0.01) num_dequeued = 0 + num_batches = 0 - while wmgr_messages := wmgr_consumer.receive(timeout=0.01): + while wmgr_messages := wmgr_consumer.recv( + timeout=0.1, + batch_timeout=batch_timeout, + ): # worker manager should not get more than `max_num_msgs` events num_dequeued += len(wmgr_messages) + num_batches += 1 # make sure we made all the expected dequeue calls and got everything assert num_dequeued == num_events + assert num_batches > 0 + assert num_batches < max_batches_expected, "too many recv calls were made" @pytest.mark.parametrize( "buffer_size", [ - pytest.param(-1, id="use default: 500"), - pytest.param(0, id="use default: 500"), - pytest.param(1, id="non-zero buffer size: 1"), - pytest.param(500, id="buffer size: 500"), - pytest.param(1000, id="buffer size: 1000"), + pytest.param( + -1, + id="replace negative, default to 500", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), + pytest.param( + 0, + id="replace zero, default to 500", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), + pytest.param( + 1, + id="non-zero buffer size: 1", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), + # pytest.param(500, id="maximum size edge case: 500"), + pytest.param( + 550, + id="larger than default: 550", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), + pytest.param( + 800, + id="much larger then default: 800", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), + pytest.param( + 1000, + id="very large buffer: 1000, unreliable in dragon-v0.10", + marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"), + ), ], ) def test_channel_buffer_size( buffer_size: int, - storage_for_dragon_fs: t.Any, + the_storage: t.Any, ) -> None: """Verify that a channel used by an EventBroadcaster can buffer messages until a configured maximum value is exceeded. - :param buffer_size: the maximum number of messages allowed in a channel buffer - :param storage_for_dragon_fs: the dragon storage engine to use""" + :param buffer_size: Maximum number of messages allowed in a channel buffer + :param the_storage: The dragon storage engine to use + """ - mock_storage = storage_for_dragon_fs + mock_storage = the_storage backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor # create a broadcaster to publish messages. 
create no consumers to # push the number of sent messages past the allotted buffer size @@ -259,9 +203,11 @@ def test_channel_buffer_size( # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(buffer_size)): - event = OnWriteFeatureStore(backbone.descriptor, key) - mock_client_app.send(event, timeout=0.1) + event = OnWriteFeatureStore( + "test_channel_buffer_size", backbone.descriptor, key + ) + mock_client_app.send(event, timeout=0.01) # adding 1 more over the configured buffer size should report the error with pytest.raises(Exception) as ex: - mock_client_app.send(event, timeout=0.1) + mock_client_app.send(event, timeout=0.01) diff --git a/tests/dragon/test_inference_reply.py b/tests/dragon/test_inference_reply.py index 1eb137ae61..bdc7be14bc 100644 --- a/tests/dragon/test_inference_reply.py +++ b/tests/dragon/test_inference_reply.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceReply from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_reply() -> InferenceReply: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_inference_request.py b/tests/dragon/test_inference_request.py index 909d021d6e..f5c8b9bdc7 100644 --- a/tests/dragon/test_inference_request.py +++ b/tests/dragon/test_inference_request.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_request() -> InferenceRequest: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py new file mode 100644 index 0000000000..f84417107d --- /dev/null +++ b/tests/dragon/test_protoclient.py @@ -0,0 +1,313 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pickle +import time +import typing as t +from unittest.mock import MagicMock + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# isort: off +from dragon import fli +from dragon.data.ddict.ddict import DDict + +# from ..ex..high_throughput_inference.mock_app import ProtoClient +from smartsim._core.mli.client.protoclient import ProtoClient + + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +WORK_QUEUE_KEY = BackboneFeatureStore.MLI_WORKER_QUEUE +logger = get_logger(__name__) + + +@pytest.fixture(scope="module") +def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: + """Fixture that creates a dragon FLI channel as a stand-in for the + worker queue created by the worker. + + :param the_backbone: The backbone feature store to update + with the worker queue descriptor. + :returns: The attached `DragonFLIChannel` + """ + + # create the FLI + to_worker_channel = create_local() + fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + comm_channel = DragonFLIChannel(fli_) + + # store the descriptor in the backbone + the_backbone.worker_queue = comm_channel.descriptor + + try: + comm_channel.send(b"foo") + except Exception as ex: + logger.exception(f"Test send from worker channel failed", exc_info=True) + + return comm_channel + + +@pytest.mark.parametrize( + "backbone_timeout, exp_wait_max", + [ + # aggregate the 1+1+1 into 3 on remaining parameters + pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), + ], +) +def test_protoclient_timeout( + backbone_timeout: float, + exp_wait_max: float, + the_backbone: BackboneFeatureStore, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempts to attach to the worker queue from the protoclient + timeout in an appropriate amount of time. Note: due to the backoff, we verify + the elapsed time is less than the 15s of a cycle of waits. 
+ + :param backbone_timeout: a timeout for use when configuring a proto client + :param exp_wait_max: a ceiling for the expected time spent waiting for + the timeout + :param the_backbone: a pre-initialized backbone featurestore for setting up + the environment variable required by the client + """ + + # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] + # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) + + with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: + start_time = time.time() + # remove the worker queue value from the backbone if it exists + # to ensure the timeout occurs + the_backbone.pop(BackboneFeatureStore.MLI_WORKER_QUEUE) + + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + + ProtoClient(timing_on=False, backbone_timeout=backbone_timeout) + elapsed = time.time() - start_time + logger.info(f"ProtoClient timeout occurred in {elapsed} seconds") + + # confirm that we met our timeout + assert ( + elapsed >= backbone_timeout + ), f"below configured timeout {backbone_timeout}" + + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" + + +def test_protoclient_initialization_no_backbone( + monkeypatch: pytest.MonkeyPatch, the_worker_queue: DragonFLIChannel +): + """Verify that attempting to start the client without required environment variables + results in an exception. + + :param the_worker_queue: Passing the worker queue fixture to ensure + the worker queue environment is correctly configured. + + NOTE: os.environ[BackboneFeatureStore.MLI_BACKBONE] is not set""" + + with monkeypatch.context() as patch, pytest.raises(SmartSimError) as ex: + patch.setenv(BackboneFeatureStore.MLI_BACKBONE, "") + + ProtoClient(timing_on=False) + + # confirm the missing value error has been raised + assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) + + +def test_protoclient_initialization( + the_backbone: BackboneFeatureStore, + the_worker_queue: DragonFLIChannel, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempting to start the client with required env vars results + in a fully initialized client. + + :param the_backbone: a pre-initialized backbone featurestore + :param the_worker_queue: an FLI channel the client will retrieve + from the backbone""" + + with monkeypatch.context() as ctx: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone + + client = ProtoClient(timing_on=False) + + fs_descriptor = the_backbone.descriptor + wq_descriptor = the_worker_queue.descriptor + + # confirm the backbone was attached correctly + assert client._backbone is not None + assert client._backbone.descriptor == fs_descriptor + + # we expect the backbone to add its descriptor to the local env + assert os.environ[BackboneFeatureStore.MLI_BACKBONE] == fs_descriptor + + # confirm the worker queue is created and attached correctly + assert client._to_worker_fli is not None + assert client._to_worker_fli.descriptor == wq_descriptor + + # we expect the worker queue descriptor to be placed into the backbone + # we do NOT expect _from_worker_ch to be placed anywhere. 
it's a specific callback + assert the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] == wq_descriptor + + # confirm the worker channels are created + assert client._from_worker_ch is not None + assert client._to_worker_ch is not None + + # wrap the channels just to easily verify they produces a descriptor + assert DragonCommChannel(client._from_worker_ch).descriptor + assert DragonCommChannel(client._to_worker_ch).descriptor + + # confirm a publisher is created + assert client._publisher is not None + + +def test_protoclient_write_model( + the_backbone: BackboneFeatureStore, + the_worker_queue: DragonFLIChannel, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that writing a model using the client causes the model data to be + written to a feature store. + + :param the_backbone: a pre-initialized backbone featurestore + :param the_worker_queue: Passing the worker queue fixture to ensure + the worker queue environment is correctly configured. + from the backbone + """ + + with monkeypatch.context() as ctx: + # we won't actually send here + client = ProtoClient(timing_on=False) + + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone + + client = ProtoClient(timing_on=False) + + model_key = "my-model" + model_bytes = b"12345" + + client.set_model(model_key, model_bytes) + + # confirm the client modified the underlying feature store + assert client._backbone[model_key] == model_bytes + + +@pytest.mark.parametrize( + "num_listeners, num_model_updates", + [(1, 1), (1, 4), (2, 4), (16, 4), (64, 8)], +) +def test_protoclient_write_model_notification_sent( + the_backbone: BackboneFeatureStore, + the_worker_queue: DragonFLIChannel, + monkeypatch: pytest.MonkeyPatch, + num_listeners: int, + num_model_updates: int, +): + """Verify that writing a model sends a key-written event. 
+ + :param the_backbone: a pre-initialized backbone featurestore + :param the_worker_queue: an FLI channel the client will retrieve + from the backbone + :param num_listeners: vary the number of registered listeners + to verify that the event is broadcast to everyone + :param num_listeners: vary the number of listeners to register + to verify the broadcast counts messages sent correctly + """ + + # we won't actually send here, but it won't try without registered listeners + listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)] + + the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor + the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor + the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners) + the_backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = None + + with monkeypatch.context() as ctx: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone + + client = ProtoClient(timing_on=False) + + publisher = t.cast(EventBroadcaster, client._publisher) + + # mock attaching to a channel given the mock-ch-desc in backbone + mock_send = MagicMock(return_value=None) + mock_comm_channel = MagicMock(**{"send": mock_send}, spec=DragonCommChannel) + mock_get_comm_channel = MagicMock(return_value=mock_comm_channel) + ctx.setattr(publisher, "_get_comm_channel", mock_get_comm_channel) + + model_key = "my-model" + model_bytes = b"12345" + + for i in range(num_model_updates): + client.set_model(model_key, model_bytes) + + # confirm that a listener channel was attached + # once for each registered listener in backbone + assert mock_get_comm_channel.call_count == num_listeners * num_model_updates + + # confirm the client raised the key-written event + assert ( + mock_send.call_count == num_listeners * num_model_updates + ), f"Expected {num_listeners} sends with {num_listeners} registrations" + + # with at least 1 consumer registered, we can verify the message is sent + for call_args in mock_send.call_args_list: + send_args = call_args.args + event_bytes, timeout = send_args[0], send_args[1] + + assert event_bytes, "Expected event bytes to be supplied to send" + assert ( + timeout == 0.001 + ), "Expected default timeout on call to `publisher.send`, " + + # confirm the correct event was raised + event = t.cast( + OnWriteFeatureStore, + pickle.loads(event_bytes), + ) + assert event.descriptor == the_backbone.descriptor + assert event.key == model_key diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index 063200dd64..48493b3c4d 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -31,7 +31,6 @@ dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply -from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index ccdbce58c3..70d73e243f 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -25,10 +25,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import gc -import io -import logging -import pathlib -import socket +import os +import subprocess as sp import time import typing as t from queue import Empty @@ -36,33 +34,27 @@ import numpy as np import pytest -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") +from . import conftest +from .utils import msg_pump + +pytest.importorskip("dragon") + -import base64 +# isort: off +import dragon import multiprocessing as mp -try: - mp.set_start_method("dragon") -except Exception: - pass +import torch -import os +# isort: on -import dragon.channels as dch -import dragon.infrastructure.policy as dragon_policy -import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.process as dragon_process from dragon import fli -from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.managed_memory import MemoryAlloc, MemoryPool -from dragon.mpbridge.queues import DragonQueue +from dragon.managed_memory import MemoryAlloc -from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, RequestDispatcher, @@ -70,210 +62,122 @@ from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .feature_store import FileSystemFeatureStore -from .utils.channel import FileSystemCommChannel - logger = get_logger(__name__) + # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. 
- - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path +try: + mp.set_start_method("dragon") +except Exception: + pass -def mock_messages( - request_dispatcher_queue: DragonFLIChannel, - feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, +@pytest.mark.parametrize("num_iterations", [4]) +def test_request_dispatcher( + num_iterations: int, + the_storage: DDict, + test_dir: str, ) -> None: - """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes - - for iteration_number in range(2): - - channel = Channel.make_process_local() - callback_channel = DragonCommChannel(channel) - - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) - - tensor = ( - (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) - ).numpy() - fsd = feature_store.descriptor - - tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(tensor.shape) - ) - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=base64.b64encode(channel.serialize()).decode("utf-8"), - model=message_model_key, - inputs=[tensor_desc], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - request_bytes = MessageHandler.serialize_request(request) - with request_dispatcher_queue._fli.sendh( - timeout=None, stream_channel=request_dispatcher_queue._channel - ) as sendh: - sendh.send_bytes(request_bytes) - sendh.send_bytes(tensor.tobytes()) - time.sleep(1) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def service_as_dragon_proc( - service: Service, cpu_affinity: list[int], gpu_affinity: list[int] -) -> dragon_process.Process: - - options = dragon_process_desc.ProcessOptions(make_inf_channels=True) - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) - return dragon_process.Process( - target=service.execute, - args=[], - cwd=os.getcwd(), - policy=local_policy, - options=options, - stderr=dragon_process.Popen.STDOUT, - stdout=dragon_process.Popen.STDOUT, - ) - - -def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: """Test the request dispatcher batching and queueing system This also includes setting a queue to disposable, checking that it is no longer referenced by the 
dispatcher. """ - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) + + backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True) # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - - ddict = DDict(1, 2, 4 * 1024**2) - dragon_fs = DragonFeatureStore(ddict) + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone_fs.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) - integrated_worker_type = TorchWorker request_dispatcher = RequestDispatcher( - batch_timeout=0, + batch_timeout=1000, batch_size=2, config_loader=config_loader, - worker_type=integrated_worker_type, + worker_type=TorchWorker, mem_pool_size=2 * 1024**2, ) worker_queue = config_loader.get_queue() if worker_queue is None: - logger.warn( + logger.warning( "FLI input queue not loaded correctly from config_loader: " f"{config_loader._queue_descriptor}" ) request_dispatcher._on_start() - for _ in range(2): + # put some messages into the work queue for the dispatcher to pickup + channels = [] + processes = [] + for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] - fs_path = test_path / f"feature_store" - comm_path = test_path / f"comm_store" - model_key = str(fs_path / "model_fs.pt") - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - dragon_fs, - fs_path, - comm_path, - ), - ) - - msg_pump.start() - time.sleep(1) + # NOTE: creating callbacks in test to avoid a local channel being torn + # down when mock_messages terms but before the final response message is sent + + callback_channel = DragonCommChannel.from_local() + channels.append(callback_channel) + + process = conftest.function_as_dragon_proc( + msg_pump.mock_messages, + [ + worker_queue.descriptor, + backbone_fs.descriptor, + i, + callback_channel.descriptor, + ], + [], + [], + ) + processes.append(process) + process.start() + assert process.returncode is None, "The message pump failed to start" - for attempts in range(15): + # give dragon some time to populate the message queues + for i in range(15): try: request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) + batch = request_dispatcher.task_queue.get(timeout=1.0) break except Empty: + time.sleep(2) + logger.warning(f"Task queue is empty on iteration {i}") continue except Exception as exc: + logger.error(f"Task queue exception on iteration {i}") raise exc - try: - assert batch is not None - assert batch.has_valid_requests + assert batch is not None + assert batch.has_valid_requests + + model_key = batch.model_id.key + try: transform_result = batch.inputs for transformed, dims, dtype in zip( transform_result.transformed, @@ -316,8 +220,6 @@ 
def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: for mem_alloc in mem_allocs: mem_alloc.free() - msg_pump.kill() - request_dispatcher._active_queues[model_key].make_disposable() assert request_dispatcher._active_queues[model_key].can_be_removed diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 9a5ed6309f..2a9e7d01bd 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -37,7 +37,7 @@ from torch import nn from torch.nn import functional as F -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -109,7 +109,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey(key="model", descriptor="xyz"), + model_key=ModelKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 1ebc512a50..4047a731fc 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -34,7 +34,6 @@ torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") -import base64 import multiprocessing as mp try: @@ -44,25 +43,26 @@ import os -import dragon.channels as dch +import torch.nn as nn from dragon import fli -from dragon.mpbridge.queues import DragonQueue -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, WorkerManager, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .feature_store import FileSystemFeatureStore from .utils.channel import FileSystemCommChannel logger = get_logger(__name__) @@ -70,111 +70,205 @@ pytestmark = pytest.mark.dragon -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: +class MiniModel(nn.Module): + """A torch model that can be executed by the default torch worker""" + + def __init__(self): + """Initialize the model.""" + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + """Execute a forward pass.""" + return self._net(input) + + @property + def bytes(self) -> bytes: + """Retrieve the serialized model + + :returns: The byte stream of the model file + """ + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + """Generate a single batch of data with the correct + shape for inference. 
+ + :returns: The batch as a torch tensor + """ + return torch.randn((100, 2), dtype=torch.float32) + + +def create_model(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) + :param model_path: The path to the torch model file + """ if not model_path.parent.exists(): model_path.parent.mkdir(parents=True, exist_ok=True) model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) + mini_model = MiniModel() + torch.save(mini_model, model_path) return model_path +def load_model() -> bytes: + """Create a simple torch model in memory for testing.""" + mini_model = MiniModel() + return mini_model.bytes + + def mock_messages( - worker_manager_queue: CommChannelBase, - feature_store: FeatureStore, feature_store_root_dir: pathlib.Path, comm_channel_root_dir: pathlib.Path, + kill_queue: mp.Queue, ) -> None: - """Mock event producer for triggering the inference pipeline""" + """Mock event producer for triggering the inference pipeline. + + :param feature_store_root_dir: Path to a directory where a + FileSystemFeatureStore can read & write results + :param comm_channel_root_dir: Path to a directory where a + FileSystemCommChannel can read & write messages + :param kill_queue: Queue used by unit test to stop mock_message process + """ feature_store_root_dir.mkdir(parents=True, exist_ok=True) comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") + iteration_number = 0 + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + backbone = config_loader.get_backbone() - feature_store[model_key] = model_bytes + worker_queue = config_loader.get_queue() + if worker_queue is None: + queue_desc = config_loader._queue_descriptor + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {queue_desc}" + ) - iteration_number = 0 + model_key = "mini-model" + model_bytes = load_model() + backbone[model_key] = model_bytes while True: + if not kill_queue.empty(): + return iteration_number += 1 time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + batch = MiniModel.get_batch() + shape = batch.shape + batch_bytes = batch.numpy().tobytes() - input_key = str(input_path) - output_key = str(output_path) + logger.debug(f"Model content: {backbone[model_key][:20]}") - buffer = io.BytesIO() - tensor = torch.randn((1, 2), dtype=torch.float32) - torch.save(tensor, buffer) - feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) + input_descriptor = MessageHandler.build_tensor_descriptor( + "f", "float32", list(shape) + ) + # The first request is always the metadata... request = MessageHandler.build_request( reply_channel=callback_channel.descriptor, - model=message_model_key, - inputs=[message_tensor_input_key], - outputs=[message_tensor_output_key], + model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"), + inputs=[input_descriptor], + outputs=[], output_descriptors=[], custom_attributes=None, ) request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.send(request_bytes) + fli: DragonFLIChannel = worker_queue + + with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh: + sendh.send_bytes(request_bytes) + sendh.send_bytes(batch_bytes) + + logger.info("published message") + + if iteration_number > 5: + return + + +def mock_mli_infrastructure_mgr() -> None: + """Create resources normally instanatiated by the infrastructure + management portion of the DragonBackend. + """ + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + integrated_worker = TorchWorker + + worker_manager = WorkerManager( + config_loader, + integrated_worker, + as_service=True, + cooldown=10, + device="cpu", + dispatcher_queue=mp.Queue(maxsize=0), + ) + worker_manager.execute() @pytest.fixture def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" + """Cleanup prior outputs to run demo repeatedly. + + :param test_dir: the directory to prepare + :returns: The path to the log file + """ path = pathlib.Path(f"{test_dir}/workermanager.log") logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) return path def test_worker_manager(prepare_environment: pathlib.Path) -> None: - """Test the worker manager""" + """Test the worker manager. 
+ + :param prepare_environment: Pass this fixture to configure + global resources before the worker manager executes + """ test_path = prepare_environment fs_path = test_path / "feature_store" comm_path = test_path / "comm_store" - to_worker_channel = dch.Channel.make_process_local() + mgr_per_node = 1 + num_nodes = 2 + mem_per_node = 128 * 1024**2 + + storage = create_ddict(num_nodes, mgr_per_node, mem_per_node) + backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) + + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli) + + # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( + to_worker_fli_comm_channel.descriptor + ) + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -197,22 +291,24 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: logger.warn( f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" ) + backbone.worker_queue = to_worker_fli_comm_channel.descriptor # create a mock client application to populate the request queue + kill_queue = mp.Queue() msg_pump = mp.Process( target=mock_messages, - args=( - worker_queue, - FileSystemFeatureStore(fs_path), - fs_path, - comm_path, - ), + args=(fs_path, comm_path, kill_queue), ) msg_pump.start() # create a process to execute commands - process = mp.Process(target=worker_manager.execute) + process = mp.Process(target=mock_mli_infrastructure_mgr) + + # let it send some messages before starting the worker manager + msg_pump.join(timeout=5) process.start() + msg_pump.join(timeout=5) + kill_queue.put_nowait("kill!") process.join(timeout=5) - process.kill() msg_pump.kill() + process.kill() diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 6cde6258f2..4c46359c2d 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -39,17 +39,15 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: - """Initialize the FileSystemCommChannel instance + def __init__(self, key: pathlib.Path) -> None: + """Initialize the FileSystemCommChannel instance. 
- :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() - if not isinstance(key, bytes): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -57,10 +55,11 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. + :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: @@ -69,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -102,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,18 +110,16 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: - logger.warning(f"failed to create fs comm channel: {descriptor!r}") + logger.warning(f"failed to create fs comm channel: {descriptor}") raise diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py new file mode 100644 index 0000000000..8d69e57c63 --- /dev/null +++ b/tests/dragon/utils/msg_pump.py @@ -0,0 +1,225 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import sys +import time +import typing as t + +import pytest + +pytest.importorskip("torch") +pytest.importorskip("dragon") + + +# isort: off +import dragon +import multiprocessing as mp +import torch +import torch.nn as nn + +# isort: on + +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__, log_level=logging.DEBUG) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +try: + mp.set_start_method("dragon") +except Exception: + pass + + +class MiniModel(nn.Module): + def __init__(self): + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + return self._net(input) + + @property + def bytes(self) -> bytes: + """Returns the model serialized to a byte stream""" + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + return torch.randn((100, 2), dtype=torch.float32) + + +def load_model() -> bytes: + """Create a simple torch model in memory for testing""" + mini_model = MiniModel() + return mini_model.bytes + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + :returns: Path to the model file + """ + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def _mock_messages( + dispatch_fli_descriptor: str, + fs_descriptor: str, + parent_iteration: int, + callback_descriptor: str, +) -> None: + """Mock event producer for triggering the inference pipeline.""" + model_key = "mini-model" + # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) + offset = 2 * parent_iteration + + feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor) + request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor) + + feature_store[model_key] = load_model() + + for iteration_number in range(2): + logged_iteration = offset + iteration_number + logger.debug(f"Sending mock message {logged_iteration}") + + output_key = f"output-{iteration_number}" + + tensor = ( + (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) + ).numpy() + fsd = feature_store.descriptor + + tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(tensor.shape) + ) + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_descriptor, + model=message_model_key, + inputs=[tensor_desc], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + + logger.info(f"Sending request {iteration_number} to request_dispatcher_queue") + request_bytes = MessageHandler.serialize_request(request) + + logger.info("Sending msg_envelope") + + # cuid = request_dispatcher_queue._channel.cuid + # logger.info(f"\tInternal cuid: {cuid}") + + # send the header & body together so they arrive together + try: + request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()]) + logger.info(f"\tenvelope 0: {request_bytes[:5]}...") + logger.info(f"\tenvelope 1: {tensor.tobytes()[:5]}...") + except Exception as ex: + logger.exception("Unable to send request envelope") + + logger.info("All messages sent") + + # keep the process alive for an extra 15 seconds to let the processor + # have access to the channels before they're destroyed + for _ in range(15): + time.sleep(1) + + +def mock_messages( + dispatch_fli_descriptor: str, + fs_descriptor: str, + parent_iteration: int, + callback_descriptor: str, +) -> int: + """Mock event producer for triggering the inference pipeline. 
Used + when starting using multiprocessing.""" + logger.info(f"{dispatch_fli_descriptor=}") + logger.info(f"{fs_descriptor=}") + logger.info(f"{parent_iteration=}") + logger.info(f"{callback_descriptor=}") + + try: + return _mock_messages( + dispatch_fli_descriptor, + fs_descriptor, + parent_iteration, + callback_descriptor, + ) + except Exception as ex: + logger.exception() + return 1 + + return 0 + + +if __name__ == "__main__": + import argparse + + args = argparse.ArgumentParser() + + args.add_argument("--dispatch-fli-descriptor", type=str) + args.add_argument("--fs-descriptor", type=str) + args.add_argument("--parent-iteration", type=int) + args.add_argument("--callback-descriptor", type=str) + + args = args.parse_args() + + return_code = mock_messages( + args.dispatch_fli_descriptor, + args.fs_descriptor, + args.parent_iteration, + args.callback_descriptor, + ) + sys.exit(return_code) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 2348784236..4c46359c2d 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -39,17 +39,15 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: - """Initialize the FileSystemCommChannel instance + def __init__(self, key: pathlib.Path) -> None: + """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -57,10 +55,11 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. + :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: @@ -69,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. 
:param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -102,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,17 +110,15 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/mli/test_default_torch_worker.py b/tests/mli/test_default_torch_worker.py deleted file mode 100644 index b2ec6c3dca..0000000000 --- a/tests/mli/test_default_torch_worker.py +++ /dev/null @@ -1,206 +0,0 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
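For reference, the FileSystemCommChannel cleanup above leaves a path-only constructor and plain-string descriptors. The following is a minimal usage sketch, not part of the patch: the temp path is hypothetical, the helper is imported by its test-tree location for illustration only, and the exact on-disk encoding of messages is an implementation detail of the test utility.

# Minimal sketch of the path-only FileSystemCommChannel used by these tests.
# The path below is hypothetical; any writable location works.
import pathlib
from tests.mli.channel import FileSystemCommChannel

channel = FileSystemCommChannel(pathlib.Path("/tmp/mli-demo/channel.txt"))
channel.send(b"hello")  # appended to the backing file, one message per line

# descriptors are plain strings now, so reattaching is just a path round-trip
reattached = FileSystemCommChannel.from_descriptor(channel.descriptor)
messages = reattached.recv()  # -> list of raw message bytes read back from the file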
- -# import io -# import pathlib -# import typing as t - -# import pytest -# import torch - -# from smartsim._core.mli.infrastructure.worker.integratedtorchworker import ( -# IntegratedTorchWorker, -# ) -# import smartsim.error as sse -# from smartsim._core.mli.infrastructure import MemoryFeatureStore -# from smartsim._core.mli.infrastructure.worker.worker import ( -# ExecuteResult, -# FetchInputResult, -# FetchModelResult, -# InferenceRequest, -# TransformInputResult, -# LoadModelResult, -# ) -# from smartsim._core.utils import installed_redisai_backends - -# # The tests in this file belong to the group_a group -# pytestmark = pytest.mark.group_b - -# # retrieved from pytest fixtures -# is_dragon = pytest.test_launcher == "dragon" -# torch_available = "torch" in installed_redisai_backends() - - -# @pytest.fixture -# def persist_torch_model(test_dir: str) -> pathlib.Path: -# test_path = pathlib.Path(test_dir) -# model_path = test_path / "basic.pt" - -# model = torch.nn.Linear(2, 1) -# torch.save(model, model_path) - -# return model_path - - -# # def test_deserialize() -> None: -# # """Verify that serialized requests are properly deserialized to -# # and converted to the internal representation used by ML workers""" -# # worker = SampleTorchWorker -# # buffer = io.BytesIO() - -# # exp_model_key = "model-key" -# # msg = InferenceRequest(model_key=exp_model_key) -# # pickle.dump(msg, buffer) - -# # deserialized: InferenceRequest = worker.deserialize(buffer.getvalue()) - -# # assert deserialized.model_key == exp_model_key -# # # assert deserialized.backend == exp_backend - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_load_model_from_disk(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model can be loaded using a FileSystemFeatureStore""" -# worker = IntegratedTorchWorker -# request = InferenceRequest(raw_model=persist_torch_model.read_bytes()) - -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# input = torch.randn(2) -# pred = load_result.model(input) - -# assert pred - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_input() -> None: -# """Verify that the default input transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# tensors = [torch.randn((rows, cols)) for _ in range(num_values)] - -# request = InferenceRequest() - -# inputs: t.List[bytes] = [] -# for tensor in tensors: -# buffer = io.BytesIO() -# torch.save(tensor, buffer) -# inputs.append(buffer.getvalue()) - -# fetch_result = FetchInputResult(inputs) -# worker = IntegratedTorchWorker -# result = worker.transform_input(request, fetch_result) -# transformed: t.Collection[torch.Tensor] = result.transformed - -# assert len(transformed) == num_values - -# for output, expected in zip(transformed, tensors): -# assert output.shape == expected.shape -# assert output.equal(expected) - -# transformed = list(transformed) - -# original: torch.Tensor = tensors[0] -# assert transformed[0].equal(original) - -# # verify a copy was made -# transformed[0] = 2 * transformed[0] -# assert transformed[0].equal(2 * original) - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model executes corrrectly via the worker""" - -# # put model bytes into memory -# model_name = "test-key" -# feature_store = 
MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(model_key=model_name) -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# value = torch.randn(2) -# transform_result = TransformInputResult([value]) - -# execute_result = worker.execute(request, load_result, transform_result) - -# assert execute_result.predictions is not None - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_missing_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a executing a model with an invalid key fails cleanly""" - -# # use key that references an un-set model value -# model_name = "test-key" -# feature_store = MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(input_keys=[model_name]) - -# load_result = LoadModelResult(None) -# transform_result = TransformInputResult( -# [torch.randn(2), torch.randn(2), torch.randn(2)] -# ) - -# with pytest.raises(sse.SmartSimError) as ex: -# worker.execute(request, load_result, transform_result) - -# assert "Model must be loaded" in ex.value.args[0] - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_output() -> None: -# """Verify that the default output transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# inputs = [torch.randn((rows, cols)) for _ in range(num_values)] -# exp_outputs = [torch.Tensor(tensor) for tensor in inputs] - -# worker = SampleTorchWorker -# request = InferenceRequest() -# exec_result = ExecuteResult(inputs) - -# result = worker.transform_output(request, exec_result) - -# assert len(result.outputs) == num_values - -# for output, expected in zip(result.outputs, exp_outputs): -# assert output.shape == expected.shape -# assert output.equal(expected) - -# transformed = list(result.outputs) - -# # verify a copy was made -# original: torch.Tensor = inputs[0] -# transformed[0] = 2 * transformed[0] - -# assert transformed[0].equal(2 * original) diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py index 617738f949..3635f6ff78 100644 --- a/tests/mli/test_service.py +++ b/tests/mli/test_service.py @@ -27,6 +27,7 @@ import datetime import multiprocessing as mp import pathlib +import time import typing as t from asyncore import loop @@ -47,23 +48,37 @@ class SimpleService(Service): def __init__( self, log: t.List[str], - quit_after: int = 0, + quit_after: int = -1, as_service: bool = False, - cooldown: int = 0, - loop_delay: int = 0, + cooldown: float = 0, + loop_delay: float = 0, + hc_freq: float = -1, + run_for: float = 0, ) -> None: - super().__init__(as_service, cooldown, loop_delay) + super().__init__(as_service, cooldown, loop_delay, hc_freq) self._log = log self._quit_after = quit_after - self.num_iterations = 0 self.num_starts = 0 self.num_shutdowns = 0 + self.num_health_checks = 0 self.num_cooldowns = 0 - self.num_can_shutdown = 0 self.num_delays = 0 + self.num_iterations = 0 + self.num_can_shutdown = 0 + self.run_for = run_for + self.start_time = time.time() - def _on_iteration(self) -> None: - self.num_iterations += 1 + @property + def runtime(self) -> float: + return time.time() - self.start_time + + def _can_shutdown(self) -> bool: + self.num_can_shutdown += 1 + + if self._quit_after > -1 and 
self.num_iterations >= self._quit_after: + return True + if self.run_for > 0: + return self.runtime >= self.run_for def _on_start(self) -> None: self.num_starts += 1 @@ -71,16 +86,17 @@ def _on_start(self) -> None: def _on_shutdown(self) -> None: self.num_shutdowns += 1 + def _on_health_check(self) -> None: + self.num_health_checks += 1 + def _on_cooldown_elapsed(self) -> None: self.num_cooldowns += 1 def _on_delay(self) -> None: self.num_delays += 1 - def _can_shutdown(self) -> bool: - self.num_can_shutdown += 1 - if self._quit_after == 0: - return True + def _on_iteration(self) -> None: + self.num_iterations += 1 return self.num_iterations >= self._quit_after @@ -134,6 +150,7 @@ def test_service_run_until_can_shutdown(num_iterations: int) -> None: # no matter what, it should always execute the _on_iteration method assert service.num_iterations == 1 else: + # the shutdown check follows on_iteration. there will be one last call assert service.num_iterations == num_iterations assert service.num_starts == 1 @@ -203,3 +220,71 @@ def test_service_delay(delay: int, num_iterations: int) -> None: assert duration_in_seconds <= expected_duration assert service.num_cooldowns == 0 assert service.num_shutdowns == 1 + + +@pytest.mark.parametrize( + "health_check_freq, run_for", + [ + pytest.param(1, 5.5, id="1s freq, 10x"), + pytest.param(5, 10.5, id="5s freq, 2x"), + pytest.param(0.1, 5.1, id="0.1s freq, 50x"), + ], +) +def test_service_health_check_freq(health_check_freq: float, run_for: float) -> None: + """Verify that a the health check frequency is honored + + :param health_check_freq: The desired frequency of the health check + :pram run_for: A fixed duration to allow the service to run + """ + activity_log: t.List[str] = [] + + service = SimpleService( + activity_log, + quit_after=-1, + as_service=True, + cooldown=0, + hc_freq=health_check_freq, + run_for=run_for, + ) + + ts0 = datetime.datetime.now() + service.execute() + ts1 = datetime.datetime.now() + + # the expected duration is the sum of the delay between each iteration + expected_hc_count = run_for // health_check_freq + + # allow some wiggle room for frequency comparison + assert expected_hc_count - 1 <= service.num_health_checks <= expected_hc_count + 1 + + assert service.num_cooldowns == 0 + assert service.num_shutdowns == 1 + + +def test_service_health_check_freq_unbound() -> None: + """Verify that a health check frequency of zero is treated as + "always on" and is called each loop iteration + + :param health_check_freq: The desired frequency of the health check + :pram run_for: A fixed duration to allow the service to run + """ + health_check_freq: float = 0.0 + run_for: float = 5 + + activity_log: t.List[str] = [] + + service = SimpleService( + activity_log, + quit_after=-1, + as_service=True, + cooldown=0, + hc_freq=health_check_freq, + run_for=run_for, + ) + + service.execute() + + # allow some wiggle room for frequency comparison + assert service.num_health_checks == service.num_iterations + assert service.num_cooldowns == 0 + assert service.num_shutdowns == 1 diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py new file mode 100644 index 0000000000..a6f9c206a4 --- /dev/null +++ b/tests/test_dragon_comm_utils.py @@ -0,0 +1,257 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import pathlib +import uuid + +import pytest + +from smartsim.error.errors import SmartSimError + +dragon = pytest.importorskip("dragon") + +# isort: off +import dragon.channels as dch +import dragon.infrastructure.parameters as dp +import dragon.managed_memory as dm +import dragon.fli as fli + +# isort: on + +from smartsim._core.mli.comm.channel import dragon_util +from smartsim.log import get_logger + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +logger = get_logger(__name__) + + +@pytest.fixture(scope="function") +def the_pool() -> dm.MemoryPool: + """Creates a memory pool.""" + raw_pool_descriptor = dp.this_process.default_pd + descriptor_ = base64.b64decode(raw_pool_descriptor) + + pool = dm.MemoryPool.attach(descriptor_) + return pool + + +@pytest.fixture(scope="function") +def the_channel() -> dch.Channel: + """Creates a Channel attached to the local memory pool.""" + channel = dch.Channel.make_process_local() + return channel + + +@pytest.fixture(scope="function") +def the_fli(the_channel) -> fli.FLInterface: + """Creates an FLI attached to the local memory pool.""" + fli_ = fli.FLInterface(main_ch=the_channel, manager_ch=None) + return fli_ + + +def test_descriptor_to_channel_empty() -> None: + """Verify that `descriptor_to_channel` raises an exception when + provided with an empty descriptor.""" + descriptor = "" + + with pytest.raises(ValueError) as ex: + dragon_util.descriptor_to_channel(descriptor) + + assert "empty" in ex.value.args[0] + + +@pytest.mark.parametrize( + "descriptor", + ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()], +) +def test_descriptor_to_channel_b64fail(descriptor: str) -> None: + """Verify that `descriptor_to_channel` raises an exception when + provided with an incorrectly encoded descriptor. + + :param descriptor: A descriptor that is not properly base64 encoded + """ + + with pytest.raises(ValueError) as ex: + dragon_util.descriptor_to_channel(descriptor) + + assert "base64" in ex.value.args[0] + + +@pytest.mark.parametrize( + "descriptor", + [str(uuid.uuid4())], +) +def test_descriptor_to_channel_channel_fail(descriptor: str) -> None: + """Verify that `descriptor_to_channel` raises an exception when a correctly + formatted descriptor that does not describe a real channel is passed. 
+ + :param descriptor: A descriptor that is not properly base64 encoded + """ + + with pytest.raises(SmartSimError) as ex: + dragon_util.descriptor_to_channel(descriptor) + + # ensure we're receiving the right exception + assert "address" in ex.value.args[0] + assert "channel" in ex.value.args[0] + + +def test_descriptor_to_channel_channel_not_available(the_channel: dch.Channel) -> None: + """Verify that `descriptor_to_channel` raises an exception when a channel + is no longer available. + + :param the_channel: A dragon channel + """ + + # get a good descriptor & wipe out the channel so it can't be attached + descriptor = dragon_util.channel_to_descriptor(the_channel) + the_channel.destroy() + + with pytest.raises(SmartSimError) as ex: + dragon_util.descriptor_to_channel(descriptor) + + assert "address" in ex.value.args[0] + + +def test_descriptor_to_channel_happy_path(the_channel: dch.Channel) -> None: + """Verify that `descriptor_to_channel` works as expected when provided + a valid descriptor + + :param the_channel: A dragon channel + """ + + # get a good descriptor + descriptor = dragon_util.channel_to_descriptor(the_channel) + + reattached = dragon_util.descriptor_to_channel(descriptor) + assert reattached + + # and just make sure creation of the descriptor is transitive + assert dragon_util.channel_to_descriptor(reattached) == descriptor + + +def test_descriptor_to_fli_empty() -> None: + """Verify that `descriptor_to_fli` raises an exception when + provided with an empty descriptor.""" + descriptor = "" + + with pytest.raises(ValueError) as ex: + dragon_util.descriptor_to_fli(descriptor) + + assert "empty" in ex.value.args[0] + + +@pytest.mark.parametrize( + "descriptor", + ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()], +) +def test_descriptor_to_fli_b64fail(descriptor: str) -> None: + """Verify that `descriptor_to_fli` raises an exception when + provided with an incorrectly encoded descriptor. + + :param descriptor: A descriptor that is not properly base64 encoded + """ + + with pytest.raises(ValueError) as ex: + dragon_util.descriptor_to_fli(descriptor) + + assert "base64" in ex.value.args[0] + + +@pytest.mark.parametrize( + "descriptor", + [str(uuid.uuid4())], +) +def test_descriptor_to_fli_fli_fail(descriptor: str) -> None: + """Verify that `descriptor_to_fli` raises an exception when a correctly + formatted descriptor that does not describe a real FLI is passed. + + :param descriptor: A descriptor that is not properly base64 encoded + """ + + with pytest.raises(SmartSimError) as ex: + dragon_util.descriptor_to_fli(descriptor) + + # ensure we're receiving the right exception + assert "address" in ex.value.args[0] + assert "fli" in ex.value.args[0].lower() + + +def test_descriptor_to_fli_fli_not_available( + the_fli: fli.FLInterface, the_channel: dch.Channel +) -> None: + """Verify that `descriptor_to_fli` raises an exception when a channel + is no longer available. 
+ + :param the_fli: A dragon FLInterface + :param the_channel: A dragon channel + """ + + # get a good descriptor & wipe out the FLI so it can't be attached + descriptor = dragon_util.channel_to_descriptor(the_fli) + the_fli.destroy() + the_channel.destroy() + + with pytest.raises(SmartSimError) as ex: + dragon_util.descriptor_to_fli(descriptor) + + # ensure we're receiving the right exception + assert "address" in ex.value.args[0] + + +def test_descriptor_to_fli_happy_path(the_fli: dch.Channel) -> None: + """Verify that `descriptor_to_fli` works as expected when provided + a valid descriptor + + :param the_fli: A dragon FLInterface + """ + + # get a good descriptor + descriptor = dragon_util.channel_to_descriptor(the_fli) + + reattached = dragon_util.descriptor_to_fli(descriptor) + assert reattached + + # and just make sure creation of the descriptor is transitive + assert dragon_util.channel_to_descriptor(reattached) == descriptor + + +def test_pool_to_descriptor_empty() -> None: + """Verify that `pool_to_descriptor` raises an exception when + provided with a null pool.""" + + with pytest.raises(ValueError) as ex: + dragon_util.pool_to_descriptor(None) + + +def test_pool_to_happy_path(the_pool) -> None: + """Verify that `pool_to_descriptor` creates a descriptor + when supplied with a valid memory pool.""" + + descriptor = dragon_util.pool_to_descriptor(the_pool) + assert descriptor diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 7b678239a0..b1d8cd34c9 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -511,10 +511,18 @@ def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir # ensure file was overwritten and env vars are not duplicated dotenv_content = exp_env_path.read_text(encoding="utf-8") - split_content = dotenv_content.split(var_name) - - # split to confirm env var only appars once - assert len(split_content) == 2 + lines = [ + line for line in dotenv_content.split("\n") if line and not "#" in line + ] + for line in lines: + if line.startswith(var_name): + # make sure the var isn't defined recursively + # DRAGON_BASE_DIR=$DRAGON_BASE_DIR + assert var_name not in line[len(var_name) + 1 :] + else: + # make sure any values reference the original base dir var + if var_name in line: + assert f"${var_name}" in line def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): @@ -532,7 +540,7 @@ def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): content = exp_env_path.read_text(encoding="utf-8") # ensure we have values written, but ignore empty lines - lines = [line for line in content.split("\n") if line] + lines = [line for line in content.split("\n") if line and not "#" in line] assert lines # ensure each line is formatted as key=value diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 37c46a573b..ea45a2cb71 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -510,7 +510,26 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st assert loaded_env # confirm .env was parsed as expected by inspecting a key + assert "DRAGON_BASE_DIR" in loaded_env + base_dir = loaded_env["DRAGON_BASE_DIR"] + assert "DRAGON_ROOT_DIR" in loaded_env + assert loaded_env["DRAGON_ROOT_DIR"] == base_dir + + assert "DRAGON_INCLUDE_DIR" in loaded_env + assert loaded_env["DRAGON_INCLUDE_DIR"] == f"{base_dir}/include" + + assert "DRAGON_LIB_DIR" in loaded_env + assert 
loaded_env["DRAGON_LIB_DIR"] == f"{base_dir}/lib" + + assert "DRAGON_VERSION" in loaded_env + assert loaded_env["DRAGON_VERSION"] == DEFAULT_DRAGON_VERSION + + assert "PATH" in loaded_env + assert loaded_env["PATH"] == f"{base_dir}/bin" + + assert "LD_LIBRARY_PATH" in loaded_env + assert loaded_env["LD_LIBRARY_PATH"] == f"{base_dir}/lib" def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index c09c787fcf..6c9b3dc951 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -38,7 +38,7 @@ def test_build_model_key_successful(): fsd = "mock-feature-store-descriptor" model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" - assert model_key.featureStoreDescriptor == fsd + assert model_key.descriptor == fsd def test_build_model_key_unsuccessful(): diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 7ede41b50d..a60818f7dd 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -101,7 +101,7 @@ "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1, input_key2], [output_key1, output_key2], @@ -109,7 +109,7 @@ torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -117,7 +117,7 @@ tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -125,7 +125,7 @@ torch_attributes, ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1], @@ -185,7 +185,7 @@ def test_build_request_indirect_successful( id="bad channel", ), pytest.param( - b"reply channel", + "reply channel", "bad model", [input_key1], [output_key2], @@ -194,7 +194,7 @@ def test_build_request_indirect_successful( id="bad model", ), pytest.param( - b"reply channel", + "reply channel", model_key, ["input_key1", "input_key2"], [output_key1, output_key2], @@ -212,7 +212,7 @@ def test_build_request_indirect_successful( id="bad input schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], ["output_key1", "output_key2"], @@ -230,7 +230,7 @@ def test_build_request_indirect_successful( id="bad output schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -239,7 +239,7 @@ def test_build_request_indirect_successful( id="bad custom attributes", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -248,7 +248,7 @@ def test_build_request_indirect_successful( id="bad custom attributes schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -276,7 +276,7 @@ def test_build_request_indirect_unsuccessful( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [tensor_1, tensor_2], [], @@ -284,7 +284,7 @@ def test_build_request_indirect_unsuccessful( torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], @@ -292,7 +292,7 @@ def 
test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_2], [], @@ -300,7 +300,7 @@ def test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], From dc31b75a922e60428ac37246d5192ade8fb401e2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 11 Oct 2024 08:22:38 -0500 Subject: [PATCH 53/60] Refine try-catch in onnx worker --- smartsim/_core/_install/buildenv.py | 1 + .../mli/infrastructure/worker/onnx_worker.py | 49 +++++++++---------- .../mli/infrastructure/worker/torch_worker.py | 1 - 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index a066ab16ac..8d28155d72 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -178,6 +178,7 @@ class RedisAIVersion(Version_): "onnx": "1.14.1", "skl2onnx": "1.16.0", "onnxmltools": "1.12.0", + "onnxruntime": "1.19.2", "scikit-learn": "1.3.2", "torch": "2.0.1", "torch_cpu_suffix": "+cpu", diff --git a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py index b22917814c..47a9b13bff 100644 --- a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import logging import os import numpy as np @@ -81,36 +80,36 @@ def load_model( else: raise ValueError("Unable to load model without reference object") + providers = [] + provider_options = [] + + if "gpu" in device.lower(): + device_split = device.split(":") + if len(device_split) > 1: + provider_options.append({"device_id": device_split[-1]}) + else: + provider_options.append({}) + if "ROCR_VISIBLE_DEVICES" in os.environ: + providers = ["ROCMExecutionProvider"] + else: + providers = ["CUDAExecutionProvider"] + + # Fallback + providers.append("CPUExecutionProvider") + provider_options.append({}) + try: - providers = [] - provider_options = [] - if "gpu" in device.lower(): - device_split = device.split(":") - if len(device_split) > 1: - provider_options.append({"device_id": device_split[-1]}) - else: - provider_options.append({}) - if "ROCR_VISIBLE_DEVICES" in os.environ: - providers = ["ROCMExecutionProvider"] - else: - providers = ["CUDAExecutionProvider"] - - # Fallback - providers.append("CPUExecutionProvider") - provider_options.append({}) + onnx_deserialized = load_model_from_string(model_bytes) + output_tensors = [n.name for n in onnx_deserialized.graph.output] + input_layers = [n.name for n in onnx_deserialized.graph.input] + session = InferenceSession( + model_bytes, providers=providers, provider_options=provider_options + ) except Exception as e: raise RuntimeError( "Failed to load and evaluate the model: " f"Model key {batch.model_id.key}, Device {device}" ) from e - - onnx_deserialized = load_model_from_string(model_bytes) - output_tensors = [n.name for n in onnx_deserialized.graph.output] - input_layers = [n.name for n in onnx_deserialized.graph.input] - - session = InferenceSession( - model_bytes, providers=providers, provider_options=provider_options - ) result = LoadModelResult( session, input_layers, diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 8eec7bcc11..298326a42a 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -97,7 +97,6 @@ def load_model( except Exception as exc: logger.info("Could not compile Torch model, original exception: ") logger.info(exc) - pass result = LoadModelResult(model) return result From 4e6ddffe46caccd816398f430177bd474003d568 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 11 Oct 2024 13:00:57 -0500 Subject: [PATCH 54/60] Use new ProtoClient in apps --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 42 +++--- ex/high_throughput_inference/mock_app_onnx.py | 135 ++++-------------- .../mock_app_tensorflow.py | 135 ++++-------------- .../standalone_worker_manager.py | 2 + .../_core/launcher/dragon/dragonBackend.py | 7 - smartsim/_core/mli/client/protoclient.py | 118 +++------------ .../_core/mli/comm/channel/dragon_channel.py | 2 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 8 +- 9 files changed, 90 insertions(+), 361 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 239db3107c..b53cb0b9cc 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -19,7 +19,7 @@ args = parser.parse_args() DEVICE = "gpu" -NUM_RANKS_PER_NODE = 16 +NUM_RANKS_PER_NODE = 1 NUM_NODES_APP = args.num_nodes_app NUM_WORKERS = 1 BATCH_SIZE = 2 diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 2448b30687..2564918450 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -24,23 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on import argparse import io - +from mpi4py import MPI import torch from smartsim.log import get_logger +from smartsim._core.mli.client.protoclient import ProtoClient torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -48,18 +39,13 @@ logger = get_logger("App") logger.info("Started app") -from collections import OrderedDict - -from smartsim.log import get_logger, log_to_file -from smartsim._core.mli.client.protoclient import ProtoClient logger = get_logger("App") - - class ResNetWrapper: """Wrapper around a pre-rained ResNet model.""" + def __init__(self, name: str, model: str): """Initialize the instance. @@ -72,6 +58,7 @@ def __init__(self, name: str, model: str): buffer = io.BytesIO(model_file.read()) self._serialized_model = buffer.getvalue() + # pylint: disable-next=no-self-use def get_batch(self, batch_size: int = 32): """Create a random batch of data with the correct dimensions to invoke a ResNet model. 
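For orientation, the client-side pattern that mock_app.py (and the ONNX/TensorFlow variants below) converge on after this change is sketched here. This is a minimal outline, not one of the example apps: it assumes an MPI launch, an already-running worker manager with the MLI backbone and worker queue in place, and placeholder model bytes and input batch.

# Minimal sketch of the ProtoClient call pattern used by the example apps.
# Assumes the MLI infrastructure is up and this script runs under MPI.
import numpy as np
from mpi4py import MPI
from smartsim._core.mli.client.protoclient import ProtoClient

comm_world = MPI.COMM_WORLD
rank = comm_world.Get_rank()
client = ProtoClient(timing_on=True, rank=rank)

model_bytes = b"..."  # placeholder: a serialized model (e.g. TorchScript or ONNX bytes)
if rank == 0:
    client.set_model("example-model", model_bytes)
comm_world.Barrier()  # ensure the model is stored before any rank requests inference

batch = np.random.randn(32, 2).astype(np.float32)  # placeholder input batch
result = client.run_model("example-model", batch)
client.perf_timer.print_timings(to_file=True, to_stdout=rank == 0)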
@@ -95,8 +82,8 @@ def name(self) -> str: return self._name -def log(msg: str, rank: int) -> None: - if rank == 0: +def log(msg: str, rank_: int) -> None: + if rank_ == 0: logger.info(msg) @@ -109,21 +96,24 @@ def log(msg: str, rank: int) -> None: resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") - client = ProtoClient(timing_on=True) - if client._rank == 0: - client.set_model(resnet.name, resnet.model) + comm_world = MPI.COMM_WORLD + rank = comm_world.Get_rank() + client = ProtoClient(timing_on=True, rank=rank) + if rank == 0: + client.set_model(resnet.name, resnet.model) + comm_world.Barrier() TOTAL_ITERATIONS = 100 for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize - log(f"Batch size: {b_size}", client._rank) + log(f"Batch size: {b_size}", rank) for iteration_number in range(TOTAL_ITERATIONS): - sample_batch = resnet.get_batch(b_size) + sample_batch = resnet.get_batch(b_size).numpy() remote_result = client.run_model(resnet.name, sample_batch) + comm_world.Barrier() logger.info(client.perf_timer.get_last("total_time")) - - client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) + client.perf_timer.print_timings(to_file=True, to_stdout=rank == 0) diff --git a/ex/high_throughput_inference/mock_app_onnx.py b/ex/high_throughput_inference/mock_app_onnx.py index dfa93937ac..2ae5e0dbdb 100644 --- a/ex/high_throughput_inference/mock_app_onnx.py +++ b/ex/high_throughput_inference/mock_app_onnx.py @@ -24,23 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on import argparse -import io -import os -import time -import typing as t -import warnings from mpi4py import MPI import numpy @@ -52,97 +37,12 @@ from skl2onnx import to_onnx -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel, create_local -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.utils.timings import PerfTimer from smartsim.log import get_logger +from smartsim._core.mli.client.protoclient import ProtoClient logger = get_logger("App") -class ProtoClient: - def __init__(self, timing_on: bool): - self._comm = MPI.COMM_WORLD - self._rank = self._comm.Get_rank() - connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - self._ddict = DDict.attach(ddict_str) - self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - to_worker_fli_str = None - while to_worker_fli_str is None: - try: - to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - except KeyError: - time.sleep(1) - self._from_worker_ch = DragonCommChannel(Channel.make_process_local()) - self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string - self._to_worker_ch = Channel.make_process_local() - - self.perf_timer: PerfTimer = PerfTimer( - debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" - ) - self._num_its: int = 0 - - def run_model(self, model: t.Union[bytes, str], batch: numpy.typing.ArrayLike): - tensors = [batch] - 
self.perf_timer.start_timings("batch_size", batch.shape[0]) - built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape) - ) - self.perf_timer.measure_time("build_tensor_descriptor") - if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) - else: - model_arg = MessageHandler.build_model(model, "lin_reg", "1.0") - request = MessageHandler.build_request( - reply_channel=self._from_worker_ch_serialized, - model=model_arg, - inputs=[built_tensor_desc], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - self.perf_timer.measure_time("build_request") - request_bytes = MessageHandler.serialize_request(request) - self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh( - timeout=None, stream_channel=self._to_worker_ch - ) as to_sendh: - to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) - self.perf_timer.measure_time("send_tensors") - resp = self._from_worker_ch.recv(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp[0]) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? - if len(resp) > 1: - data_blob = resp[1] - else: - data_blob: bytes = self._from_worker_ch.recv(timeout=None)[0] - self.perf_timer.measure_time("receive_tensor") - result = numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) - - self.perf_timer.measure_time("deserialize_tensor") - - self.perf_timer.end_timings() - self._num_its += 1 - self._comm.Barrier() - return result - - def set_model(self, key: str, model: bytes): - self._ddict[key] = model - - class LinRegWrapper: def __init__( self, @@ -156,21 +56,33 @@ def __init__( def _get_onnx_model(self, model: onnx.onnx_ml_pb2.ModelProto): self._serialized_model = model.SerializeToString() + # pylint: disable-next=no-self-use def get_batch(self, batch_size: int = 32): + """Create a random batch of data with the correct dimensions to + invoke a ResNet model. + + :param batch_size: The desired number of samples to produce + :returns: A PyTorch tensor""" x = numpy.random.randn(batch_size, 1).astype(numpy.float32) - return poly.fit_transform(x.reshape(-1,1)) + return poly.fit_transform(x.reshape(-1, 1)) @property def model(self): + """The content of a model file. + + :returns: The model bytes""" return self._serialized_model @property def name(self): + """The name applied to the model. 
+ + :returns: The name""" return self._name -def log(msg: str, rank: int) -> None: - if rank == 0: +def log(msg: str, rank_: int) -> None: + if rank_ == 0: logger.info(msg) @@ -192,9 +104,11 @@ def log(msg: str, rank: int) -> None: linreg = LinRegWrapper("LinReg", onnx_model) - client = ProtoClient(timing_on=True) + comm_world = MPI.COMM_WORLD + rank = comm_world.Get_rank() + client = ProtoClient(timing_on=True, rank=rank) - if client._rank == 0: + if rank == 0: client.set_model(linreg.name, linreg.model) MPI.COMM_WORLD.Barrier() @@ -203,13 +117,14 @@ def log(msg: str, rank: int) -> None: for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize - log(f"Batch size: {b_size}", client._rank) + log(f"Batch size: {b_size}", rank) for iteration_number in range(TOTAL_ITERATIONS): sample_batch = linreg.get_batch(b_size) remote_result = client.run_model(linreg.name, sample_batch) log( - f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", - client._rank, + f"Completed iteration: {iteration_number} " + f"in {client.perf_timer.get_last('total_time')} seconds", + rank, ) - client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) + client.perf_timer.print_timings(to_file=True, to_stdout=rank == 0) diff --git a/ex/high_throughput_inference/mock_app_tensorflow.py b/ex/high_throughput_inference/mock_app_tensorflow.py index 0ecd5bb17d..704d51ee48 100644 --- a/ex/high_throughput_inference/mock_app_tensorflow.py +++ b/ex/high_throughput_inference/mock_app_tensorflow.py @@ -24,23 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on import argparse -import io -import os -import time -import typing as t -import warnings from mpi4py import MPI import numpy @@ -49,97 +34,12 @@ convert_variables_to_constants_v2_as_graph, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel, create_local -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.utils.timings import PerfTimer from smartsim.log import get_logger +from smartsim._core.mli.client.protoclient import ProtoClient logger = get_logger("App") -class ProtoClient: - def __init__(self, timing_on: bool): - self._comm = MPI.COMM_WORLD - self._rank = self._comm.Get_rank() - connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - self._ddict = DDict.attach(ddict_str) - self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - to_worker_fli_str = None - while to_worker_fli_str is None: - try: - to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - except KeyError: - time.sleep(1) - self._from_worker_ch = DragonCommChannel(Channel.make_process_local()) - self._from_worker_ch_serialized = self._from_worker_ch.descriptor_string - self._to_worker_ch = Channel.make_process_local() - - self.perf_timer: PerfTimer = PerfTimer( - debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" - ) - self._num_its: int = 0 - - 
def run_model(self, model: t.Union[bytes, str], batch: numpy.typing.ArrayLike): - tensors = [batch] - self.perf_timer.start_timings("batch_size", batch.shape[0]) - built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape) - ) - self.perf_timer.measure_time("build_tensor_descriptor") - if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) - else: - model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") - request = MessageHandler.build_request( - reply_channel=self._from_worker_ch_serialized, - model=model_arg, - inputs=[built_tensor_desc], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - self.perf_timer.measure_time("build_request") - request_bytes = MessageHandler.serialize_request(request) - self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh( - timeout=None, stream_channel=self._to_worker_ch - ) as to_sendh: - to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) - self.perf_timer.measure_time("send_tensors") - resp = self._from_worker_ch.recv(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp[0]) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? - if len(resp) > 1: - data_blob = resp[1] - else: - data_blob: bytes = self._from_worker_ch.recv(timeout=None)[0] - self.perf_timer.measure_time("receive_tensor") - result = numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) - - self.perf_timer.measure_time("deserialize_tensor") - - self.perf_timer.end_timings() - self._num_its += 1 - self._comm.Barrier() - return result - - def set_model(self, key: str, model: bytes): - self._ddict[key] = model - - class ResNetWrapper: def __init__( self, @@ -156,20 +56,32 @@ def _get_tf_model(self, model: tf.keras.Model): _, graph_def = convert_variables_to_constants_v2_as_graph(real_model) self._serialized_model = graph_def.SerializeToString() + # pylint: disable-next=no-self-use def get_batch(self, batch_size: int = 32): + """Create a random batch of data with the correct dimensions to + invoke a ResNet model. + + :param batch_size: The desired number of samples to produce + :returns: A NumPy array""" return numpy.random.randn(batch_size, 224, 224, 3).astype(numpy.float32) @property def model(self): + """The content of a model file. + + :returns: The model bytes""" return self._serialized_model @property def name(self): + """The name applied to the model. 
+ + :returns: The name""" return self._name -def log(msg: str, rank: int) -> None: - if rank == 0: +def log(msg: str, rank_: int) -> None: + if rank_ == 0: logger.info(msg) @@ -182,24 +94,27 @@ def log(msg: str, rank: int) -> None: resnet = ResNetWrapper("resnet50", tf.keras.applications.ResNet50()) - client = ProtoClient(timing_on=True) + comm_world = MPI.COMM_WORLD + rank = comm_world.Get_rank() + client = ProtoClient(timing_on=True, rank=rank) - if client._rank == 0: + if rank == 0: client.set_model(resnet.name, resnet.model) - MPI.COMM_WORLD.Barrier() + comm_world.Barrier() TOTAL_ITERATIONS = 100 for log2_bsize in range(args.log_max_batchsize, args.log_max_batchsize + 1): b_size: int = 2**log2_bsize - log(f"Batch size: {b_size}", client._rank) + log(f"Batch size: {b_size}", rank) for iteration_number in range(TOTAL_ITERATIONS): sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) log( - f"Completed iteration: {iteration_number} in {client.perf_timer.get_last('total_time')} seconds", - client._rank, + f"Completed iteration: {iteration_number} in " + f"{client.perf_timer.get_last('total_time')} seconds", + rank, ) - client.perf_timer.print_timings(to_file=True, to_stdout=client._rank == 0) + client.perf_timer.print_timings(to_file=True, to_stdout=rank == 0) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 7857af9922..afaf996780 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -140,7 +140,9 @@ def service_as_dragon_proc( to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) + backbone._allow_reserved_writes = True backbone.worker_queue = to_worker_fli_comm_ch.descriptor + backbone._allow_reserved_writes = False os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 036fbbc52b..17438f4bbf 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -948,13 +948,6 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: self._frontend_shutdown = request.frontend_shutdown return DragonShutdownResponse() - def __del__(self) -> None: - try: - if self._infra_ddict is not None: - self._infra_ddict.destroy() - except Exception: - logger.error("Could not destroy Backbone dictionary") - class DragonBackendView: def __init__(self, backend: DragonBackend) -> None: diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py index 46598a8171..7056d498fd 100644 --- a/smartsim/_core/mli/client/protoclient.py +++ b/smartsim/_core/mli/client/protoclient.py @@ -30,11 +30,6 @@ import dragon.channels from dragon.globalservices.api_setup import connect_to_infrastructure -try: - from mpi4py import MPI # type: ignore[import-not-found] -except Exception: - MPI = None - print("Unable to import `mpi4py` package") # isort: on # pylint: enable=unused-import,import-error @@ -46,7 +41,7 @@ from collections import OrderedDict import numpy -import torch +import numpy.typing as npt from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli 
import DragonFLIChannel @@ -82,7 +77,7 @@ class ProtoClient: triggering QueueFull exceptions.""" _EVENT_SOURCE = "proto-client" - """A user-friendly name for this class instance to identify + """A user-friendly name for this class instance to identify the client as the publisher of an event.""" @staticmethod @@ -131,7 +126,9 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: ) raise SmartSimError("Unable to locate worker queue using backbone") from ex - return DragonFLIChannel.from_descriptor(descriptor) + fli_channel = DragonFLIChannel.from_descriptor(descriptor) + + return fli_channel def _create_broadcaster(self) -> EventBroadcaster: """Create an EventBroadcaster that broadcasts events to @@ -148,6 +145,7 @@ def __init__( self, timing_on: bool, backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT, + rank: int = 0 ) -> None: """Initialize the client instance. @@ -158,13 +156,7 @@ def __init__( :raises SmartSimError: If unable to attach to a backbone featurestore :raises ValueError: If an invalid backbone timeout is specified """ - if MPI is not None: - # TODO: determine a way to make MPI work in the test environment - # - consider catching the import exception and defaulting rank to 0 - comm = MPI.COMM_WORLD - rank: int = comm.Get_rank() - else: - rank = 0 + self._rank = rank if backbone_timeout <= 0: raise ValueError( @@ -179,18 +171,14 @@ def __init__( self._backbone.wait_timeout = self.backbone_timeout self._to_worker_fli = self._attach_to_worker_queue() - self._from_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE) + self._from_worker_ch = DragonCommChannel(create_local(self._DEFAULT_WORK_QUEUE_SIZE)) self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE) self._publisher = self._create_broadcaster() self.perf_timer: PerfTimer = PerfTimer( - debug=False, timing_on=timing_on, prefix=f"a{rank}_" + debug=False, timing_on=timing_on, prefix=f"a{self._rank}_" ) - self._start: t.Optional[float] = None - self._interm: t.Optional[float] = None - self._timings: _TimingDict = OrderedDict() - self._timing_on = timing_on @property def backbone_timeout(self) -> float: @@ -200,83 +188,16 @@ def backbone_timeout(self) -> float: :returns: A float indicating the number of seconds to allow""" return self._backbone_timeout - def _add_label_to_timings(self, label: str) -> None: - """Adds a new label into the timing dictionary to prepare for - receiving timing events. - - :param label: The label to create storage for - """ - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: t.Union[numbers.Number, float]) -> str: - """Utility function for formatting numbers consistently for logs. - - :param number: The number to convert to a formatted string - :returns: The formatted string containing the number - """ - return f"{number:0.4e}" - - def start_timings(self, batch_size: numbers.Number) -> None: - """Configure the client to begin storing timing information. 
- - :param batch_size: The size of batches to generate as inputs - to the model - """ - if self._timing_on: - self._add_label_to_timings("batch_size") - self._timings["batch_size"].append(self._format_number(batch_size)) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self) -> None: - """Configure the client to stop storing timing information.""" - if self._timing_on and self._start is not None: - self._add_label_to_timings("total_time") - self._timings["total_time"].append( - self._format_number(time.perf_counter() - self._start) - ) - - def measure_time(self, label: str) -> None: - """Measures elapsed time since the last recorded signal. - - :param label: The label to measure time for - """ - if self._timing_on and self._interm is not None: - self._add_label_to_timings(label) - self._timings[label].append( - self._format_number(time.perf_counter() - self._interm) - ) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False) -> None: - """Print timing information to standard output. If `to_file` - is `True`, also write results to a file. - - :param to_file: If `True`, also saves timing information - to the files `timings.npy` and `timings.txt` - """ - print(" ".join(self._timings.keys())) - - value_array = numpy.array(self._timings.values(), dtype=float) - value_array = numpy.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - numpy.save("timings.npy", value_array) - numpy.savetxt("timings.txt", value_array) - - def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: + def run_model(self, model: t.Union[bytes, str], batch: npt.ArrayLike) -> t.Any: """Execute a batch of inference requests with the supplied ML model. - :param model: The raw bytes or path to a pytorch model + :param model: The raw bytes or path to a model :param batch: The tensor batch to perform inference on :returns: The inference results :raises ValueError: if the worker queue is not configured properly in the environment variables """ - tensors = [batch.numpy()] + tensors = [batch] self.perf_timer.start_timings("batch_size", batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape) @@ -304,14 +225,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: raise ValueError("No worker queue available.") # pylint: disable-next=protected-access - with self._to_worker_fli._channel.sendh( # type: ignore - timeout=None, - stream_channel=self._to_worker_ch.channel, - ) as to_sendh: - to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! + self._to_worker_fli.send_multiple([request_bytes, *[tensor.tobytes() for tensor in tensors]], timeout=None) logger.info(f"Message size: {len(request_bytes)} bytes") self.perf_timer.measure_time("send_tensors") @@ -324,15 +238,15 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: # recv depending on the len(response.result.descriptors)? 
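The wire protocol relied on by the reworked run_model, shown as a minimal sketch (a single input tensor is assumed; `fli`, `request_bytes`, `response_bytes`, and `data_blob` are stand-in names, not identifiers from this module):

    # one FLI stream carries the serialized request followed by each tensor's raw bytes
    fli.send_multiple([request_bytes, batch.tobytes()], timeout=None)

    # the reply channel yields the serialized response message first ...
    response = MessageHandler.deserialize_response(response_bytes)
    # ... then one raw blob per result descriptor, decoded via numpy
    result = numpy.frombuffer(data_blob, dtype=str(response.result.descriptors[0].dataType))
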
data_blob: bytes = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_tensor") - result = torch.from_numpy( - numpy.frombuffer( + result = numpy.frombuffer( data_blob, dtype=str(response.result.descriptors[0].dataType), ) - ) + self.perf_timer.measure_time("deserialize_tensor") self.perf_timer.end_timings() + return result def set_model(self, key: str, model: bytes) -> None: diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 60a960decd..1d777681de 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -66,7 +66,7 @@ def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: """ try: with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value, blocking=False) + sendh.send_bytes(value, timeout=None) logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index b7ad1cda9d..5fceeb76c2 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -77,8 +77,8 @@ def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: - sendh.send_bytes(value, timeout=timeout) + with self._fli.sendh(timeout=timeout, stream_channel=self._channel) as sendh: + sendh.send_bytes(value, timeout=None) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: self._channel = None @@ -101,9 +101,9 @@ def send_multiple( if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + with self._fli.sendh(timeout=timeout, stream_channel=self._channel) as sendh: for value in values: - sendh.send_bytes(value) + sendh.send_bytes(value, timeout=None) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: self._channel = None From 9947013c11b7e596a29bfc13678ff31715140a51 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 11 Oct 2024 13:59:56 -0500 Subject: [PATCH 55/60] Style --- smartsim/_core/mli/client/protoclient.py | 18 ++++++++++-------- smartsim/_core/mli/comm/channel/dragon_fli.py | 8 ++++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py index 7056d498fd..41ec8f700e 100644 --- a/smartsim/_core/mli/client/protoclient.py +++ b/smartsim/_core/mli/client/protoclient.py @@ -34,9 +34,7 @@ # isort: on # pylint: enable=unused-import,import-error -import numbers import os -import time import typing as t from collections import OrderedDict @@ -145,7 +143,7 @@ def __init__( self, timing_on: bool, backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT, - rank: int = 0 + rank: int = 0, ) -> None: """Initialize the client instance. 
@@ -171,7 +169,9 @@ def __init__( self._backbone.wait_timeout = self.backbone_timeout self._to_worker_fli = self._attach_to_worker_queue() - self._from_worker_ch = DragonCommChannel(create_local(self._DEFAULT_WORK_QUEUE_SIZE)) + self._from_worker_ch = DragonCommChannel( + create_local(self._DEFAULT_WORK_QUEUE_SIZE) + ) self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE) self._publisher = self._create_broadcaster() @@ -225,7 +225,9 @@ def run_model(self, model: t.Union[bytes, str], batch: npt.ArrayLike) -> t.Any: raise ValueError("No worker queue available.") # pylint: disable-next=protected-access - self._to_worker_fli.send_multiple([request_bytes, *[tensor.tobytes() for tensor in tensors]], timeout=None) + self._to_worker_fli.send_multiple( + [request_bytes, *[tensor.tobytes() for tensor in tensors]], timeout=None + ) logger.info(f"Message size: {len(request_bytes)} bytes") self.perf_timer.measure_time("send_tensors") @@ -239,9 +241,9 @@ def run_model(self, model: t.Union[bytes, str], batch: npt.ArrayLike) -> t.Any: data_blob: bytes = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_tensor") result = numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) self.perf_timer.measure_time("deserialize_tensor") diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 5fceeb76c2..7e8741c8a0 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -77,7 +77,9 @@ def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=timeout, stream_channel=self._channel) as sendh: + with self._fli.sendh( + timeout=timeout, stream_channel=self._channel + ) as sendh: sendh.send_bytes(value, timeout=None) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: @@ -101,7 +103,9 @@ def send_multiple( if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=timeout, stream_channel=self._channel) as sendh: + with self._fli.sendh( + timeout=timeout, stream_channel=self._channel + ) as sendh: for value in values: sendh.send_bytes(value, timeout=None) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") From 23de37f71df21d4b857401c0f8da6c85a9aae60b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 11 Oct 2024 14:17:50 -0500 Subject: [PATCH 56/60] Mypy --- smartsim/_core/mli/client/protoclient.py | 3 +-- smartsim/_core/mli/comm/channel/dragon_fli.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py index 41ec8f700e..82845fad20 100644 --- a/smartsim/_core/mli/client/protoclient.py +++ b/smartsim/_core/mli/client/protoclient.py @@ -39,7 +39,6 @@ from collections import OrderedDict import numpy -import numpy.typing as npt from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel @@ -188,7 +187,7 @@ def backbone_timeout(self) -> float: :returns: A float indicating the number of seconds to allow""" return self._backbone_timeout - def run_model(self, model: t.Union[bytes, str], batch: npt.ArrayLike) -> t.Any: + def run_model(self, model: t.Union[bytes, 
str], batch: numpy.ndarray[t.Any, t.Any]) -> t.Any: """Execute a batch of inference requests with the supplied ML model. :param model: The raw bytes or path to a model diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 7e8741c8a0..e99c8bc8ae 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -91,7 +91,7 @@ def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: def send_multiple( self, values: t.Sequence[bytes], - timeout: float = 0.001, + timeout: t.Optional[float] = 0.001, ) -> None: """Send a message through the underlying communication channel. From 10d59c801ca857b1ef715f1a31482984b7f5613b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 11 Oct 2024 14:31:30 -0500 Subject: [PATCH 57/60] Style --- smartsim/_core/mli/client/protoclient.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/client/protoclient.py b/smartsim/_core/mli/client/protoclient.py index 82845fad20..1f01198183 100644 --- a/smartsim/_core/mli/client/protoclient.py +++ b/smartsim/_core/mli/client/protoclient.py @@ -187,7 +187,9 @@ def backbone_timeout(self) -> float: :returns: A float indicating the number of seconds to allow""" return self._backbone_timeout - def run_model(self, model: t.Union[bytes, str], batch: numpy.ndarray[t.Any, t.Any]) -> t.Any: + def run_model( + self, model: t.Union[bytes, str], batch: numpy.ndarray[t.Any, t.Any] + ) -> t.Any: """Execute a batch of inference requests with the supplied ML model. :param model: The raw bytes or path to a model From ce5a306f0a8ee05540ac36830f0fb0f7ab2c59f5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 14 Oct 2024 10:40:35 -0500 Subject: [PATCH 58/60] Fix tests --- tests/dragon/test_device_manager.py | 4 ++-- tests/dragon/test_dragon_backend.py | 1 + tests/dragon/test_onnx_worker.py | 4 ++-- tests/dragon/test_tensorflow_worker.py | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index d270e921cb..b8775ef9bc 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -143,7 +143,7 @@ def test_device_manager_model_in_request(): ) as returned_device: assert returned_device == worker_device - assert worker_device.get_model(model_key.key) == b"raw model" + assert worker_device.get_model(model_key.key).model == b'raw model' assert model_key.key not in worker_device @@ -181,6 +181,6 @@ def test_device_manager_model_key(): ) as returned_device: assert returned_device == worker_device - assert worker_device.get_model(model_key.key) == b"fetched_model" + assert worker_device.get_model(model_key.key).model == b"fetched_model" assert model_key.key in worker_device diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 2b2ef50f99..9099ac4b37 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -109,6 +109,7 @@ def test_dragonbackend_start_listener(the_backend: DragonBackend): comm_channel.send(event_bytes) subscriber_list = [] + logger.warning(backbone.notification_channels) # Give the channel time to write the message and the listener time to handle it for i in range(20): diff --git a/tests/dragon/test_onnx_worker.py b/tests/dragon/test_onnx_worker.py index f4103741dd..c9cfeccd26 100644 --- a/tests/dragon/test_onnx_worker.py +++ b/tests/dragon/test_onnx_worker.py @@ -45,7 +45,7 @@ from 
sklearn.linear_model import LinearRegression from sklearn.preprocessing import PolynomialFeatures -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey from smartsim._core.mli.infrastructure.worker.onnx_worker import ONNXWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -103,7 +103,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey(key="model", descriptor="xyz"), + model_key=ModelKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensors, input_keys=None, diff --git a/tests/dragon/test_tensorflow_worker.py b/tests/dragon/test_tensorflow_worker.py index 9bbbd0090f..67014f1694 100644 --- a/tests/dragon/test_tensorflow_worker.py +++ b/tests/dragon/test_tensorflow_worker.py @@ -40,7 +40,7 @@ import dragon.globalservices.pool as dragon_gs_pool from dragon.managed_memory import MemoryAlloc, MemoryPool -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey from smartsim._core.mli.infrastructure.worker.tensorflow_worker import TensorFlowWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -105,7 +105,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey(key="model", descriptor="xyz"), + model_key=ModelKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensors, input_keys=None, From f23c267bde5f5fdd869f0fb3d5bf278d2c0f439f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 14 Oct 2024 12:16:42 -0500 Subject: [PATCH 59/60] Fix tests --- smartsim/_core/mli/comm/channel/channel.py | 13 ++++++++-- .../_core/mli/comm/channel/dragon_channel.py | 25 ++++++++++++------ smartsim/_core/mli/comm/channel/dragon_fli.py | 26 +++++++++++++------ .../control/request_dispatcher.py | 2 +- .../mli/infrastructure/storage/dragon_util.py | 8 +++--- tests/dragon/test_device_manager.py | 2 +- tests/dragon/test_dragon_backend.py | 1 - tests/dragon/test_protoclient.py | 2 +- tests/dragon/utils/msg_pump.py | 4 ++- 9 files changed, 56 insertions(+), 27 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 5da120df99..2e35b7ccac 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -52,19 +52,28 @@ def __init__( """A user-friendly identifier for channel-related logging""" @abstractmethod - def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: float = 0.001, + ) -> None: """Send a message through the underlying communication channel. :param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait to obtain new send handle :raises SmartSimError: If sending message fails """ @abstractmethod - def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: + def recv( + self, timeout: t.Optional[float] = 0.001, handle_timeout: float = 0.001 + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
:param timeout: Maximum time to wait (in seconds) for messages to arrive + :param handle_timeout: Maximum time to wait to obtain new receive handle :returns: The received message """ diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 1d777681de..8b7b85b267 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -57,33 +57,42 @@ def channel(self) -> "dch.Channel": """ return self._channel - def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: float = 0.001, + ) -> None: """Send a message through the underlying communication channel. :param value: The value to send - :param timeout: Maximum time to wait (in seconds) for messages to send + :param timeout: Maximum time to wait (in seconds) for messages to be sent + :param handle_timeout: Maximum time to wait to obtain new send handle :raises SmartSimError: If sending message fails """ try: - with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value, timeout=None) + with self._channel.sendh(timeout=handle_timeout) as sendh: + sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( f"Error sending via DragonCommChannel {self.descriptor}" ) from e - def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: + def recv( + self, timeout: t.Optional[float] = 0.001, handle_timeout: float = 0.001 + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. - :param timeout: Maximum time to wait (in seconds) for messages to arrive + :param timeout: Maximum time to wait (in seconds) for message to arrive + :param handle_timeout: Maximum time to wait to obtain new receive handle :returns: The received message(s) """ - with self._channel.recvh(timeout=timeout) as recvh: + with self._channel.recvh(timeout=handle_timeout) as recvh: messages: t.List[bytes] = [] try: - message_bytes = recvh.recv_bytes(timeout=None) + message_bytes = recvh.recv_bytes(timeout=timeout) messages.append(message_bytes) logger.debug(f"DragonCommChannel {self.descriptor} received message") except dch.ChannelEmpty: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index e99c8bc8ae..4852f768c2 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -66,11 +66,17 @@ def __init__( self._buffer_size: int = buffer_size """Maximum number of messages that can be buffered before sending""" - def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: float = 0.001, + ) -> None: """Send a message through the underlying communication channel. 
:param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait to obtain new send handle :raises SmartSimError: If sending message fails """ try: @@ -78,9 +84,9 @@ def send(self, value: bytes, timeout: t.Optional[float] = 0.001) -> None: self._channel = drg_util.create_local(self._buffer_size) with self._fli.sendh( - timeout=timeout, stream_channel=self._channel + timeout=handle_timeout, stream_channel=self._channel ) as sendh: - sendh.send_bytes(value, timeout=None) + sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: self._channel = None @@ -92,11 +98,13 @@ def send_multiple( self, values: t.Sequence[bytes], timeout: t.Optional[float] = 0.001, + handle_timeout: float = 0.001, ) -> None: """Send a message through the underlying communication channel. :param values: The values to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait to obtain new send handle :raises SmartSimError: If sending message fails """ try: @@ -104,10 +112,10 @@ def send_multiple( self._channel = drg_util.create_local(self._buffer_size) with self._fli.sendh( - timeout=timeout, stream_channel=self._channel + timeout=handle_timeout, stream_channel=self._channel ) as sendh: for value in values: - sendh.send_bytes(value, timeout=None) + sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: self._channel = None @@ -115,7 +123,9 @@ def send_multiple( f"Error sending via DragonFLIChannel {self.descriptor} {e}" ) from e - def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: + def recv( + self, timeout: t.Optional[float] = 0.001, handle_timeout: float = 0.001 + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
:param timeout: Maximum time to wait (in seconds) for messages to arrive @@ -124,10 +134,10 @@ def recv(self, timeout: t.Optional[float] = 0.001) -> t.List[bytes]: """ messages = [] eot = False - with self._fli.recvh(timeout=timeout) as recvh: + with self._fli.recvh(timeout=handle_timeout) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=None) + message, _ = recvh.recv_bytes(timeout=timeout) messages.append(message) logger.debug(f"DragonFLIChannel {self.descriptor} received message") except fli.FLIEOT: diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 534d7ccf07..7341df20d4 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -243,7 +243,7 @@ def __init__( raise SmartSimError("No incoming channel for dispatcher") self._incoming_channel = incoming_channel """The channel the dispatcher monitors for new tasks""" - self._outgoing_queue: DragonQueue = mp.Queue(maxsize=10000) + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=1000) """The queue on which batched inference requests are placed""" self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py index 50d15664c0..9a0808a8ff 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_util.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -67,7 +67,7 @@ def create_ddict( :param num_nodes: The number of distributed nodes to distribute the dictionary to. At least one node is required. :param mgr_per_node: The number of manager processes per node - :param mem_per_node: The amount of memory (in megabytes) to allocate per node. Total + :param mem_per_node: The amount of memory (in bytes) to allocate per node. 
Total memory available will be calculated as `num_nodes * node_mem` :returns: The instantiated dragon dictionary @@ -84,18 +84,18 @@ def create_ddict( if mem_per_node < dragon_ddict.DDICT_MIN_SIZE: raise ValueError( "A dragon dictionary requires at least " - f"{dragon_ddict.DDICT_MIN_SIZE / 1024} MB" + f"{dragon_ddict.DDICT_MIN_SIZE / (1024**2)} MB" ) mem_total = num_nodes * mem_per_node logger.debug( - f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} MB memory" + f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} bytes memory" ) distributed_dict = dragon_ddict.DDict(num_nodes, mgr_per_node, total_mem=mem_total) logger.debug( "Successfully created dragon dictionary with " - f"{num_nodes} nodes, {mem_total} MB total memory" + f"{num_nodes} nodes, {mem_total} bytes total memory" ) return distributed_dict diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index b8775ef9bc..6b22c8bd66 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -143,7 +143,7 @@ def test_device_manager_model_in_request(): ) as returned_device: assert returned_device == worker_device - assert worker_device.get_model(model_key.key).model == b'raw model' + assert worker_device.get_model(model_key.key).model == b"raw model" assert model_key.key not in worker_device diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 9099ac4b37..2b2ef50f99 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -109,7 +109,6 @@ def test_dragonbackend_start_listener(the_backend: DragonBackend): comm_channel.send(event_bytes) subscriber_list = [] - logger.warning(backbone.notification_channels) # Give the channel time to write the message and the listener time to handle it for i in range(20): diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index f84417107d..008fe313df 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -196,7 +196,7 @@ def test_protoclient_initialization( assert client._to_worker_ch is not None # wrap the channels just to easily verify they produces a descriptor - assert DragonCommChannel(client._from_worker_ch).descriptor + assert DragonCommChannel(client._from_worker_ch.channel).descriptor assert DragonCommChannel(client._to_worker_ch).descriptor # confirm a publisher is created diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index 8d69e57c63..f3beaa8134 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -163,7 +163,9 @@ def _mock_messages( # send the header & body together so they arrive together try: - request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()]) + request_dispatcher_queue.send_multiple( + [request_bytes, tensor.tobytes()], timeout=None, handle_timeout=None + ) logger.info(f"\tenvelope 0: {request_bytes[:5]}...") logger.info(f"\tenvelope 1: {tensor.tobytes()[:5]}...") except Exception as ex: From da924720a1a9c7b75312acfe815c2a37657fb4fb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 31 Oct 2024 17:09:40 -0500 Subject: [PATCH 60/60] Complete post-merge operations --- smartsim/_core/mli/comm/channel/channel.py | 17 +++++++-- .../_core/mli/comm/channel/dragon_channel.py | 22 +++++++++--- smartsim/_core/mli/comm/channel/dragon_fli.py | 34 ++++++++++++++---- .../mli/infrastructure/worker/onnx_worker.py | 35 ++++++++++--------- .../worker/tensorflow_worker.py | 30 ++++++++-------- 
.../mli/infrastructure/worker/torch_worker.py | 5 +-- .../_core/mli/infrastructure/worker/worker.py | 11 +++++- smartsim/settings/arguments/launch/dragon.py | 20 ----------- 8 files changed, 105 insertions(+), 69 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 104333ce7f..afc2e65d79 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -52,19 +52,32 @@ def __init__( """A user-friendly identifier for channel-related logging""" @abstractmethod - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> None: """Send a message through the underlying communication channel. :param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait (in seconds) to obtain + new send handle :raises SmartSimError: If sending message fails """ @abstractmethod - def recv(self, timeout: float = 0.001) -> t.List[bytes]: + def recv( + self, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. :param timeout: Maximum time to wait (in seconds) for messages to arrive + :param handle_timeout: Maximum time to wait (in seconds) to obtain new + receive handle :returns: The received message """ diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 110f19258a..3444c05454 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -57,29 +57,41 @@ def channel(self) -> "dch.Channel": """ return self._channel - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> None: """Send a message through the underlying communication channel. :param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait (in seconds) to obtain + new send handle :raises SmartSimError: If sending message fails """ try: - with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value, blocking=False) + with self._channel.sendh(timeout=handle_timeout) as sendh: + sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( f"Error sending via DragonCommChannel {self.descriptor}" ) from e - def recv(self, timeout: float = 0.001) -> t.List[bytes]: + def recv( + self, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
:param timeout: Maximum time to wait (in seconds) for messages to arrive + :param handle_timeout: Maximum time to wait (in seconds) to obtain new :returns: The received message(s) """ - with self._channel.recvh(timeout=timeout) as recvh: + with self._channel.recvh(timeout=handle_timeout) as recvh: messages: t.List[bytes] = [] try: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 01849247cd..9438efc25c 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -68,18 +68,27 @@ def __init__( self._buffer_size: int = buffer_size """Maximum number of messages that can be buffered before sending""" - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send( + self, + value: bytes, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> None: """Send a message through the underlying communication channel. :param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait (in seconds) to obtain new + send handle :raises SmartSimError: If sending message fails """ try: if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + with self._fli.sendh( + timeout=handle_timeout, stream_channel=self._channel + ) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: @@ -91,21 +100,26 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: def send_multiple( self, values: t.Sequence[bytes], - timeout: float = 0.001, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, ) -> None: """Send a message through the underlying communication channel. :param values: The values to send :param timeout: Maximum time to wait (in seconds) for messages to send + :param handle_timeout: Maximum time to wait (in seconds) to obtain new send + handle :raises SmartSimError: If sending message fails """ try: if self._channel is None: self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + with self._fli.sendh( + timeout=handle_timeout, stream_channel=self._channel + ) as sendh: for value in values: - sendh.send_bytes(value) + sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: self._channel = None @@ -113,16 +127,22 @@ def send_multiple( f"Error sending via DragonFLIChannel {self.descriptor} {e}" ) from e - def recv(self, timeout: float = 0.001) -> t.List[bytes]: + def recv( + self, + timeout: t.Optional[float] = 0.001, + handle_timeout: t.Optional[float] = 0.001, + ) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
:param timeout: Maximum time to wait (in seconds) for messages to arrive + :param handle_timeout: Maximum time to wait (in seconds) to obtain new + receive handle :returns: The received message(s) :raises SmartSimError: If receiving message(s) fails """ messages = [] eot = False - with self._fli.recvh(timeout=timeout) as recvh: + with self._fli.recvh(timeout=handle_timeout) as recvh: while not eot: try: message, _ = recvh.recv_bytes(timeout=timeout) diff --git a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py index 47a9b13bff..4299863315 100644 --- a/smartsim/_core/mli/infrastructure/worker/onnx_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/onnx_worker.py @@ -38,7 +38,6 @@ from .....error import SmartSimError from .....log import get_logger -from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, FetchInputResult, @@ -46,6 +45,7 @@ LoadModelResult, MachineLearningWorkerBase, RequestBatch, + TensorMeta, TransformInputResult, TransformOutputResult, ) @@ -120,51 +120,52 @@ def load_model( @staticmethod def transform_input( batch: RequestBatch, - fetch_results: list[FetchInputResult], + fetch_results: FetchInputResult, mem_pool: MemoryPool, ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. - :param batch: The request batch that triggered the pipeline - :param fetch_result: Raw outputs from fetching inputs + :param batch: The batch that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs from feature store or + request :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult :raises ValueError: If tensors cannot be reconstructed :raises IndexError: If index out of range """ - results: list[memoryview] = [] + results: list[bytes] = [] total_samples = 0 slices: list[slice] = [] all_dims: list[list[int]] = [] all_dtypes: list[str] = [] - if fetch_results[0].meta is None: + if fetch_results.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") # Traverse inputs to get total number of samples and compute slices # Assumption: first dimension is samples, all tensors in the same input # have same number of samples # thus we only look at the first tensor for each input - for res_idx, fetch_result in enumerate(fetch_results): - if fetch_result.meta is None or any( - item_meta is None for item_meta in fetch_result.meta + for res_idx, res_meta_list in enumerate(fetch_results.meta): + if res_meta_list is None or any( + item_meta is None for item_meta in res_meta_list ): raise ValueError("Cannot reconstruct tensor without meta information") - first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + first_tensor_desc: TensorMeta = res_meta_list[0] # type: ignore num_samples = first_tensor_desc.dimensions[0] slices.append(slice(total_samples, total_samples + num_samples)) total_samples = total_samples + num_samples - if res_idx == len(fetch_results) - 1: + if res_idx == len(fetch_results.meta) - 1: # For each tensor in the last input, get remaining dimensions # Assumptions: all inputs have the same number of tensors and # last N-1 dimensions match across inputs for corresponding tensors # thus: resulting array will be of size (num_samples, all_other_dims) - for item_meta in fetch_result.meta: - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - 
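The transform_input batching logic above concatenates each request's tensor bytes along the sample (first) dimension and records one slice per request; a small, self-contained illustration with made-up shapes (not taken from the tests):

    import numpy as np

    a = np.zeros((2, 3), dtype=np.float32)  # request 0 -> slice(0, 2)
    b = np.zeros((4, 3), dtype=np.float32)  # request 1 -> slice(2, 6)
    joined = b"".join([a.tobytes(), b.tobytes()])  # bytes written into the MemoryPool view
    batched = np.frombuffer(joined, dtype=np.float32).reshape(6, 3)  # dims = [total_samples, 3]
    assert batched.shape[0] == a.shape[0] + b.shape[0]
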
tensor_dims = list(tensor_desc.dimensions) + for item_meta in res_meta_list: + tensor_desc: TensorMeta = item_meta # type: ignore + tensor_dims = tensor_desc.dimensions all_dims.append([total_samples, *tensor_dims[1:]]) - all_dtypes.append(str(tensor_desc.dataType)) + all_dtypes.append(tensor_desc.datatype) for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): itemsize = np.empty((1), dtype=dtype).itemsize @@ -174,8 +175,8 @@ def transform_input( try: joined = b"".join( [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results + fetch_result[result_tensor_idx] + for fetch_result in fetch_results.inputs ] ) mem_view[:alloc_size] = joined diff --git a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py index d532cc160b..bd1f8b7cee 100644 --- a/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/tensorflow_worker.py @@ -41,7 +41,6 @@ from .....error import SmartSimError from .....log import get_logger -from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, FetchInputResult, @@ -49,6 +48,7 @@ LoadModelResult, MachineLearningWorkerBase, RequestBatch, + TensorMeta, TransformInputResult, TransformOutputResult, ) @@ -148,7 +148,7 @@ def load_model( @staticmethod def transform_input( batch: RequestBatch, - fetch_results: list[FetchInputResult], + fetch_results: FetchInputResult, mem_pool: MemoryPool, ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data and put @@ -161,38 +161,38 @@ def transform_input( :raises ValueError: If tensors cannot be reconstructed :raises IndexError: If index out of range """ - results: list[memoryview] = [] + results: list[bytes] = [] total_samples = 0 slices: list[slice] = [] all_dims: list[list[int]] = [] all_dtypes: list[str] = [] - if fetch_results[0].meta is None: + if fetch_results.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") # Traverse inputs to get total number of samples and compute slices # Assumption: first dimension is samples, all tensors in the same input # have same number of samples # thus we only look at the first tensor for each input - for res_idx, fetch_result in enumerate(fetch_results): - if fetch_result.meta is None or any( - item_meta is None for item_meta in fetch_result.meta + for res_idx, res_meta_list in enumerate(fetch_results.meta): + if res_meta_list is None or any( + item_meta is None for item_meta in res_meta_list ): raise ValueError("Cannot reconstruct tensor without meta information") - first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + first_tensor_desc: TensorMeta = res_meta_list[0] # type: ignore num_samples = first_tensor_desc.dimensions[0] slices.append(slice(total_samples, total_samples + num_samples)) total_samples = total_samples + num_samples - if res_idx == len(fetch_results) - 1: + if res_idx == len(fetch_results.meta) - 1: # For each tensor in the last input, get remaining dimensions # Assumptions: all inputs have the same number of tensors and # last N-1 dimensions match across inputs for corresponding tensors # thus: resulting array will be of size (num_samples, all_other_dims) - for item_meta in fetch_result.meta: - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - tensor_dims = list(tensor_desc.dimensions) + for item_meta in res_meta_list: + tensor_desc: TensorMeta = item_meta # type: ignore + tensor_dims = 
tensor_desc.dimensions all_dims.append([total_samples, *tensor_dims[1:]]) - all_dtypes.append(str(tensor_desc.dataType)) + all_dtypes.append(tensor_desc.datatype) for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): itemsize = np.empty((1), dtype=dtype).itemsize @@ -202,8 +202,8 @@ def transform_input( try: joined = b"".join( [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results + fetch_result[result_tensor_idx] + for fetch_result in fetch_results.inputs ] ) mem_view[:alloc_size] = joined diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index f8d6e7c2de..4642369ef5 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -105,13 +105,14 @@ def transform_input( the raw tensor data on a MemoryPool allocation. :param batch: The batch that triggered the pipeline - :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param fetch_result: Raw outputs from fetching inputs from feature store or + request :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult :raises ValueError: If tensors cannot be reconstructed :raises IndexError: If index out of range """ - results: list[torch.Tensor] = [] + results: list[bytes] = [] total_samples = 0 slices: list[slice] = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index b122a1d9ba..96aaffd85c 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -208,13 +208,22 @@ class TensorMeta: class LoadModelResult: """A wrapper around a loaded model.""" - def __init__(self, model: t.Any) -> None: + def __init__( + self, + model: t.Any, + inputs: t.Optional[t.List[str]] = None, + outputs: t.Optional[t.List[str]] = None, + ) -> None: """Initialize the LoadModelResult. :param model: The loaded model """ self.model = model """The loaded model (e.g. a TensorFlow, PyTorch, ONNX, etc. 
model)""" + self.inputs = inputs + """List of input layer names, only used in TensorFlow""" + self.outputs = outputs + """List of output tensor names, only used in TensorFlow""" class TransformInputResult: diff --git a/smartsim/settings/arguments/launch/dragon.py b/smartsim/settings/arguments/launch/dragon.py index 5f7f28a0e6..d8044267e6 100644 --- a/smartsim/settings/arguments/launch/dragon.py +++ b/smartsim/settings/arguments/launch/dragon.py @@ -104,26 +104,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: raise ValueError(f"invalid names found in hostlist: {host_list}") self.set("host-list", ",".join(cleaned_list)) - @override - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: - """Specify the hostlist for this job - - :param host_list: hosts to launch on - :raises ValueError: if an empty host list is supplied - """ - if not host_list: - raise ValueError("empty hostlist provided") - - if isinstance(host_list, str): - host_list = host_list.replace(" ", "").split(",") - - # strip out all whitespace-only values - cleaned_list = [host.strip() for host in host_list if host and host.strip()] - if not len(cleaned_list) == len(host_list): - raise ValueError(f"invalid names found in hostlist: {host_list}") - - self.run_args["host-list"] = ",".join(cleaned_list) - def set_cpu_affinity(self, devices: t.List[int]) -> None: """Set the CPU affinity for this job