From 5e2b7c0b44ab4d100c11f9a1d153ad6c1b42ebc6 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sat, 30 Nov 2024 09:07:28 +0200
Subject: [PATCH] Break up task validation and setting defaults
---
amlb/resources.py | 84 +++++++++++--------
.../resources/test_benchmark_definition.py | 39 ++++-----
2 files changed, 62 insertions(+), 61 deletions(-)
diff --git a/amlb/resources.py b/amlb/resources.py
index e4951c5cd..03bcbc58c 100644
--- a/amlb/resources.py
+++ b/amlb/resources.py
@@ -229,30 +229,19 @@ def _benchmark_definition(
)
for task in tasks:
task |= defaults # add missing keys from hard defaults + defaults
- Resources._validate_task(task, config_)
+ Resources._validate_task(task)
+ Resources._add_task_defaults(task, config_)
- Resources._validate_task(defaults, config_, lenient=True)
+ Resources._add_task_defaults(defaults, config_)
defaults.enabled = False
tasks.append(defaults)
log.debug("Available task definitions:\n%s", tasks)
return tasks, benchmark_name, benchmark_path
@staticmethod
- def _validate_task(task: Namespace, config_: Namespace, lenient: bool = False):
- if not lenient and task["name"] is None:
- raise ValueError(
- f"`name` is mandatory but missing in task definition {task}."
- )
-
+ def _add_task_defaults(task: Namespace, config_: Namespace):
if task["id"] is None:
task["id"] = Resources.generate_task_identifier(task)
- if not lenient and task["id"] is None:
- raise ValueError(
- "task definition must contain an ID or one property "
- "among ['openml_task_id', 'dataset'] to create an ID, "
- "but task definition is {task}".format(task=str(task))
- )
-
for conf in [
"max_runtime_seconds",
"cores",
@@ -265,41 +254,62 @@ def _validate_task(task: Namespace, config_: Namespace, lenient: bool = False):
task[conf] = config_.benchmarks.defaults[conf]
log.debug(
"Config `{config}` not set for task {name}, using default `{value}`.".format(
- config=conf, name=task.name, value=task[conf]
+ config=conf, name=task["name"], value=task[conf]
)
)
- conf = "ec2_instance_type"
- if task[conf] is None:
- i_series = config_.aws.ec2.instance_type.series
- i_map = config_.aws.ec2.instance_type.map
- if str(task.cores) in i_map:
- i_size = i_map[str(task.cores)]
- elif task.cores > 0:
- supported_cores = list(
- map(int, Namespace.dict(i_map).keys() - {"default"})
- )
- supported_cores.sort()
- cores = next((c for c in supported_cores if c >= task.cores), "default")
- i_size = i_map[str(cores)]
- else:
- i_size = i_map.default
- task[conf] = ".".join([i_series, i_size])
+ if task["ec2_instance_type"] is None:
+ task["ec2_instance_type"] = Resources.lookup_ec2_instance_type(
+ config_, task.cores
+ )
log.debug(
"Config `{config}` not set for task {name}, using default selection `{value}`.".format(
- config=conf, name=task.name, value=task[conf]
+ config=conf, name=task["name"], value=task["ec2_instance_type"]
)
)
- conf = "ec2_volume_type"
- if task[conf] is None:
- task[conf] = config_.aws.ec2.volume_type
+ if task["ec2_volume_type"] is None:
+ task["ec2_volume_type"] = config_.aws.ec2.volume_type
log.debug(
"Config `{config}` not set for task {name}, using default `{value}`.".format(
- config=conf, name=task.name, value=task[conf]
+ config=conf, name=task["name"], value=task["ec2_volume_type"]
)
)
+ @staticmethod
+ def _validate_task(task: Namespace) -> None:
+ """Raises ValueError if task does not have a name and a way to generate an identifier."""
+ if task["name"] is None:
+ raise ValueError(
+ f"`name` is mandatory but missing in task definition {task}."
+ )
+ task_id = Namespace.get(task, "id", Resources.generate_task_identifier(task))
+ if task_id is None:
+ raise ValueError(
+ "task definition must contain an ID or one property "
+ "among ['openml_task_id', 'dataset'] to create an ID, "
+ "but task definition is {task}".format(task=str(task))
+ )
+
+ @staticmethod
+ def lookup_ec2_instance_type(config_: Namespace, cores: int) -> str:
+ i_series = config_.aws.ec2.instance_type.series
+ i_map = config_.aws.ec2.instance_type.map
+ i_size = Resources.lookup_suitable_instance_size(i_map, cores)
+ return f"{i_series}.{i_size}"
+
+ @staticmethod
+ def lookup_suitable_instance_size(cores_to_size: Namespace, cores: int) -> str:
+ if str(cores) in cores_to_size:
+ return cores_to_size[str(cores)]
+
+ supported_cores = list(map(int, set(dir(cores_to_size)) - {"default"}))
+ if cores <= 0 or cores > max(supported_cores):
+ return cores_to_size.default
+
+ cores = next((c for c in sorted(supported_cores) if c >= cores), "default")
+ return cores_to_size[str(cores)]
+
@staticmethod
def generate_task_identifier(task: Namespace) -> str | None:
if task["openml_task_id"] is not None:
diff --git a/tests/unit/amlb/resources/test_benchmark_definition.py b/tests/unit/amlb/resources/test_benchmark_definition.py
index e8874cfaa..4b452ce02 100644
--- a/tests/unit/amlb/resources/test_benchmark_definition.py
+++ b/tests/unit/amlb/resources/test_benchmark_definition.py
@@ -1,5 +1,3 @@
-from functools import partial
-
import pytest
from amlb import Resources
@@ -34,20 +32,13 @@ def amlb_dummy_configuration() -> Namespace:
def test_validate_task_strict_requires_name():
with pytest.raises(ValueError) as excinfo:
- Resources._validate_task(
- task=Namespace(),
- config_=Namespace(),
- lenient=False,
- )
+ Resources._validate_task(task=Namespace())
assert "mandatory but missing" in excinfo.value.args[0]
def test_validate_task_strict_requires_id(amlb_dummy_configuration: Namespace):
- strict_validate = partial(
- Resources._validate_task, config_=amlb_dummy_configuration, lenient=False
- )
with pytest.raises(ValueError) as excinfo:
- strict_validate(task=Namespace(name="foo"))
+ Resources._validate_task(task=Namespace(name="foo"))
assert "must contain an ID or one property" in excinfo.value.args[0]
@@ -61,17 +52,17 @@ def test_validate_task_strict_requires_id(amlb_dummy_configuration: Namespace):
(Namespace(dataset=Namespace(id="bar")), "bar"),
],
)
-def test_validate_task_id_formatting(
+def test_add_task_defaults_formatting(
properties: Namespace, expected: str, amlb_dummy_configuration: Namespace
):
task = Namespace(name="foo") | properties
- Resources._validate_task(task=task, config_=amlb_dummy_configuration)
+ Resources._add_task_defaults(task=task, config_=amlb_dummy_configuration)
assert task["id"] == expected
-def test_validate_task_adds_benchmark_defaults(amlb_dummy_configuration: Namespace):
- task = Namespace(name=None)
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+def test_add_task_defaults_sets_benchmark_defaults(amlb_dummy_configuration: Namespace):
+ task = Namespace()
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
config = Namespace.dict(amlb_dummy_configuration, deep=True)
for setting, default in config["benchmarks"]["defaults"].items():
@@ -79,9 +70,9 @@ def test_validate_task_adds_benchmark_defaults(amlb_dummy_configuration: Namespa
assert task["ec2_volume_type"] == amlb_dummy_configuration.aws.ec2.volume_type
-def test_validate_task_does_not_overwrite(amlb_dummy_configuration: Namespace):
- task = Namespace(name=None, cores=42)
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+def test_add_task_defaults_does_not_overwrite(amlb_dummy_configuration: Namespace):
+ task = Namespace(cores=42)
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
config = Namespace.dict(amlb_dummy_configuration, deep=True)
assert task.cores == 42
@@ -90,31 +81,31 @@ def test_validate_task_does_not_overwrite(amlb_dummy_configuration: Namespace):
assert task[setting] == default
-def test_validate_task_looks_up_instance_type(amlb_dummy_configuration: Namespace):
+def test_add_task_defaults_looks_up_instance_type(amlb_dummy_configuration: Namespace):
instance_type = amlb_dummy_configuration.aws.ec2.instance_type
reverse_size_map = {v: k for k, v in Namespace.dict(instance_type.map).items()}
n_cores_for_small = int(reverse_size_map["small"])
task = Namespace(name="foo", cores=n_cores_for_small)
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
assert (
task["ec2_instance_type"] == "m5.small"
), "Should resolve to the instance type with the exact amount of cores"
task = Namespace(name="foo", cores=n_cores_for_small - 1)
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
assert (
task["ec2_instance_type"] == "m5.small"
), "If exact amount of cores are not available, should resolve to next biggest"
task = Namespace(name="foo", cores=n_cores_for_small + 1)
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
assert (
task["ec2_instance_type"] == "m5.large"
), "If bigger than largest in map, should revert to default"
task = Namespace(name="foo", ec2_instance_type="bar")
- Resources._validate_task(task, amlb_dummy_configuration, lenient=True)
+ Resources._add_task_defaults(task, amlb_dummy_configuration)
assert (
task["ec2_instance_type"] == "bar"
), "Should not overwrite explicit configuration"