pyproject.toml (6 changes: 5 additions & 1 deletion)
@@ -32,7 +32,8 @@ dependencies = [
     "python-multipart>=0.0.16",
     "filelock",
     "psutil",
-    "gpuhunt==0.1.11",
+    # TODO: release and pin new version
+    "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip",
    "argcomplete>=3.5.0",
    "ignore-python>=0.2.0",
    "orjson",
@@ -67,6 +68,9 @@ artifacts = [
     "src/dstack/_internal/server/statics/**",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true  # TODO: unset
+
 [tool.hatch.metadata.hooks.fancy-pypi-readme]
 content-type = "text/markdown"
 
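Note: hatchling rejects direct (URL) references in `dependencies` by default, which is why the temporary gpuhunt archive dependency needs the `[tool.hatch.metadata]` override above; both TODOs should go away once a new gpuhunt release is pinned. A quick, non-authoritative way to confirm which gpuhunt distribution actually got installed:

```python
# Sanity check for the temporary direct-reference install of gpuhunt.
# The reported version comes from the branch's own package metadata,
# so treat the exact string as indicative only.
from importlib.metadata import version

print(version("gpuhunt"))
```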
src/dstack/_internal/core/backends/hotaisle/compute.py (108 changes: 25 additions & 83 deletions)
@@ -2,7 +2,7 @@
 import subprocess
 import tempfile
 from threading import Thread
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import gpuhunt
 from gpuhunt.providers.hotaisle import HotAisleProvider
@@ -22,6 +22,7 @@
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
+    InstanceOffer,
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
@@ -31,48 +32,7 @@
 logger = get_logger(__name__)
 
 
-INSTANCE_TYPE_SPECS = {
-    "1x MI300X 8x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "1x MI300X 13x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-}
+SUPPORTED_GPUS = ["MI300X"]
 
 
 class HotAisleCompute(
@@ -95,45 +55,15 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
             catalog=self.catalog,
+            extra_filter=_supported_instances,
         )
-        supported_offers = []
-        for offer in offers:
-            if offer.instance.name in INSTANCE_TYPE_SPECS:
-                supported_offers.append(
-                    InstanceOfferWithAvailability(
-                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
-                    )
-                )
-            else:
-                logger.warning(
-                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
-                )
-        return supported_offers
-
-    def get_payload_from_offer(self, instance_type) -> dict:
-        instance_type_name = instance_type.name
-        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
-        cpu_cores = instance_type.resources.cpus
-
-        return {
-            "cpu_cores": cpu_cores,
-            "cpus": {
-                "count": 1,
-                "manufacturer": cpu_specs["cpu_manufacturer"],
-                "model": cpu_specs["cpu_model"],
-                "cores": cpu_cores,
-                "frequency": cpu_specs["cpu_frequency"],
-            },
-            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
-            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
-            "gpus": [
-                {
-                    "count": len(instance_type.resources.gpus),
-                    "manufacturer": instance_type.resources.gpus[0].vendor,
-                    "model": instance_type.resources.gpus[0].name,
-                }
-            ],
-        }
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.AVAILABLE,
+            )
+            for offer in offers
+        ]
 
     def create_instance(
         self,
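For reference, the deleted get_payload_from_offer assembled a VM-creation payload of the following shape (capacities in bytes). With this PR, gpuhunt's Hot Aisle provider ships an equivalent dict in each offer's provider data, surfaced below as HotAisleOfferBackendData.vm_specs. The concrete values here are illustrative, not taken from a real offer:

```python
# Shape of the Hot Aisle VM payload formerly built by get_payload_from_offer.
# All values are illustrative; capacities are in bytes, frequency in Hz.
example_vm_specs = {
    "cpu_cores": 13,
    "cpus": {
        "count": 1,
        "manufacturer": "Intel",
        "model": "Xeon Platinum 8470",
        "cores": 13,
        "frequency": 2_000_000_000,
    },
    "disk_capacity": 100 * 1024**3,
    "ram_capacity": 224 * 1024**3,
    "gpus": [
        {"count": 1, "manufacturer": "AMD", "model": "MI300X"},
    ],
}
```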
@@ -143,8 +73,10 @@
     ) -> JobProvisioningData:
         project_ssh_key = instance_config.ssh_keys[0]
         self.api_client.upload_ssh_key(project_ssh_key.public)
-        vm_payload = self.get_payload_from_offer(instance_offer.instance)
-        vm_data = self.api_client.create_virtual_machine(vm_payload)
+        offer_backend_data: HotAisleOfferBackendData = (
+            HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
+        )
+        vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs)
         return JobProvisioningData(
             backend=instance_offer.backend,
             instance_type=instance_offer.instance,
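A minimal sketch of the new round trip, using plain pydantic as a stand-in for dstack's CoreModel/__response__ machinery: the offer's backend_data (a dict produced by gpuhunt's catalog) is validated into a typed model, and its vm_specs dict is handed to the Hot Aisle API unchanged. The raw payload below is hypothetical:

```python
# Stand-in for the backend_data -> vm_specs -> create_virtual_machine flow.
# Plain pydantic replaces dstack's CoreModel here; payload values are made up.
from typing import Any, Dict

from pydantic import BaseModel


class OfferBackendData(BaseModel):
    vm_specs: Dict[str, Any]


raw_backend_data = {
    "vm_specs": {"cpu_cores": 13, "gpus": [{"count": 1, "model": "MI300X"}]},
}

data = OfferBackendData.parse_obj(raw_backend_data)  # validate the catalog-provided dict
assert data.vm_specs["cpu_cores"] == 13
# In compute.py, this dict then goes straight to the API:
# vm_data = api_client.create_virtual_machine(data.vm_specs)
```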
@@ -240,10 +172,20 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
     )
 
 
+def _supported_instances(offer: InstanceOffer) -> bool:
+    return len(offer.instance.resources.gpus) > 0 and all(
+        gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
+    )
+
+
 class HotAisleInstanceBackendData(CoreModel):
     ip_address: str
 
     @classmethod
     def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
         assert raw is not None
         return cls.__response__.parse_raw(raw)
+
+
+class HotAisleOfferBackendData(CoreModel):
+    vm_specs: dict[str, Any]
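The _supported_instances predicate replaces the old INSTANCE_TYPE_SPECS name lookup: offers with no GPUs, or with any GPU outside SUPPORTED_GPUS, never leave the catalog. A self-contained toy check of those semantics, with minimal stand-ins for the dstack offer models:

```python
# Toy check of the _supported_instances semantics. Gpu/Resources/Instance/Offer
# are simplified stand-ins for dstack's models, not the real classes.
from dataclasses import dataclass, field
from typing import List

SUPPORTED_GPUS = ["MI300X"]


@dataclass
class Gpu:
    name: str


@dataclass
class Resources:
    gpus: List[Gpu] = field(default_factory=list)


@dataclass
class Instance:
    resources: Resources


@dataclass
class Offer:
    instance: Instance


def _supported_instances(offer: Offer) -> bool:
    return len(offer.instance.resources.gpus) > 0 and all(
        gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
    )


assert _supported_instances(Offer(Instance(Resources([Gpu("MI300X")]))))
assert not _supported_instances(Offer(Instance(Resources([]))))  # CPU-only offer
assert not _supported_instances(Offer(Instance(Resources([Gpu("H100")]))))  # unsupported GPU
```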