Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions src/getml_io/getml/scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from collections.abc import Sequence
from datetime import datetime
from typing import Annotated, Literal

from pydantic import BaseModel, Field


class _Score(BaseModel):
    """Fields shared by every serialized pipeline score.

    Concrete score models extend this base with their own metric fields
    and a literal ``type`` tag.
    """

    # Timestamp attached to the score.
    date_time: datetime
    # Name of the data subset the score refers to.
    set_used: str
    # Name of the target the score refers to.
    target: str


class ClassificationScore(_Score):
    """Serialized score of a classification pipeline.

    Adds the classification metrics to the common score fields and tags
    the model with ``type == "classification"``.
    """

    # Classification metrics, stored as plain floats.
    accuracy: float
    auc: float
    cross_entropy: float
    # Constant tag identifying this model; it is the field the ``Score``
    # union discriminates on.
    type: Literal["classification"] = "classification"


class RegressionScore(_Score):
    """Serialized score of a regression pipeline.

    Adds the regression metrics to the common score fields and tags the
    model with ``type == "regression"``.
    """

    # Regression metrics, stored as plain floats.
    mae: float
    rmse: float
    rsquared: float
    # Constant tag identifying this model; it is the field the ``Score``
    # union discriminates on.
    type: Literal["regression"] = "regression"


# A single score of either kind. The ``type`` field is declared as the
# discriminator, so the concrete model is chosen from its literal value.
Score = Annotated[ClassificationScore | RegressionScore, Field(discriminator="type")]

# All scores of one pipeline. The sequence is homogeneous by type: either
# every entry is a classification score or every entry is a regression score.
Scores = (
    Sequence[ClassificationScore]
    | Sequence[RegressionScore]
)
Comment thread
Urfoex marked this conversation as resolved.
Comment thread
Urfoex marked this conversation as resolved.
3 changes: 2 additions & 1 deletion src/getml_io/metadata/pipeline_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from getml_io.getml.features import Features
from getml_io.getml.predictors import FeatureSelector, Predictor
from getml_io.getml.preprocessors import Preprocessor
from getml_io.getml.scores import Scores
from getml_io.metadata.data_model_information import DataModelInformation
from getml_io.metadata.dataframe_information import DataFrameInformationByName
from getml_io.metadata.placeholder_information import PlaceholderInformation
Expand Down Expand Up @@ -44,7 +45,7 @@ class PipelineInformation(BaseModel):
targets: Sequence[str]
data_model: DataModelInformation
features: Features
# scores # TODO @urfoex: #18
scores: Scores
# columns # TODO @urfoex: #50
# metadata # TODO @urfoex: #51
# tables # TODO @urfoex: #52
18 changes: 18 additions & 0 deletions src/getml_io/serialize/exception.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from pathlib import Path

from getml.pipeline.score import Score as GetMLScore

from getml_io.getml.roles import Role
from getml_io.metadata.dataframe_information import (
ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING,
Expand Down Expand Up @@ -91,3 +93,19 @@ def __init__(
f"Supported are: {list(ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING.keys())}."
)
super().__init__(message)


class WrongPipelineScoreTypeError(GetMLIOError):
    """Exception raised when a score is not of the score type that was expected."""

    def __init__(
        self,
        expected_type: type[GetMLScore],
        received_type: type[GetMLScore],
    ) -> None:
        """Build the error message from the two score classes.

        Args:
            expected_type: The score class that was required.
            received_type: The score class that was actually encountered.

        """
        # Only the class names go into the message; the classes themselves
        # are not kept on the exception instance.
        super().__init__(
            f"Expected score type {expected_type.__name__}, "
            f"but received {received_type.__name__}.",
        )
69 changes: 68 additions & 1 deletion src/getml_io/serialize/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
)
from getml.pipeline import Features as GetMLFeatures
from getml.pipeline import Pipeline
from getml.pipeline import Scores as GetMLScores
from getml.pipeline.score import ClassificationScore as GetMLClassificationScore
from getml.pipeline.score import RegressionScore as GetMLRegressionScore
from getml.pipeline.score import Score as GetMLScore
from numpy.typing import NDArray

from getml_io.getml.feature_learning import (
Expand Down Expand Up @@ -43,6 +47,7 @@
Substring,
TextFieldSplitter,
)
from getml_io.getml.scores import ClassificationScore, RegressionScore, Scores
from getml_io.metadata.dataframe_information import DataFrameInformationByName
from getml_io.metadata.pipeline_information import (
LossFunction,
Expand All @@ -51,6 +56,7 @@
from getml_io.serialize.data_model import serialize_data_model
from getml_io.serialize.dataframe_information import derive_instances_with_relative_path
from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view
from getml_io.serialize.exception import WrongPipelineScoreTypeError
from getml_io.serialize.pipeline_information import serialize_pipeline_information
from getml_io.serialize.placeholder import serialize_placeholder
from getml_io.utils.convert import (
Expand Down Expand Up @@ -123,7 +129,7 @@ def serialize_pipeline(
targets=pipeline.targets,
data_model=serialize_data_model(pipeline.data_model),
features=serialize_features(pipeline.features),
# scores # TODO @urfoex: #18
scores=serialize_scores(pipeline.scores),
# columns # TODO @urfoex: #50
# metadata # TODO @urfoex: #51
# tables # TODO @urfoex: #52
Expand Down Expand Up @@ -331,3 +337,64 @@ def serialize_features(features: GetMLFeatures) -> Features:
)
for feature in features
}


def serialize_scores(scores: GetMLScores) -> Scores:
    """Convert the scores of a getML pipeline into their serializable form.

    The container's ``is_classification`` flag decides which converter is
    used; all contained scores are then converted by that one converter.

    Args:
        scores: The getML Scores to serialize.

    Returns:
        Scores: The serialized scores, either all classification scores or
            all regression scores.

    Raises:
        WrongPipelineScoreTypeError: If a contained score is not of the
            type selected by ``is_classification``.

    """
    if scores.is_classification:
        return _serialize_classification_scores(list(scores))
    return _serialize_regression_scores(list(scores))
Comment thread
Urfoex marked this conversation as resolved.


def _serialize_classification_scores(
    scores: list[GetMLScore],
) -> list[ClassificationScore]:
    """Convert getML classification scores into ``ClassificationScore`` models.

    Args:
        scores: The getML scores to convert, in order.

    Returns:
        One ``ClassificationScore`` per input score, in the same order.

    Raises:
        WrongPipelineScoreTypeError: If any score is not a getML
            classification score. Conversion stops at the first such score.

    """

    def _to_classification_score(getml_score: GetMLScore) -> ClassificationScore:
        # Guard first: the metric attributes read below are only accessed
        # once the score is known to be a classification score.
        if not isinstance(getml_score, GetMLClassificationScore):
            raise WrongPipelineScoreTypeError(
                GetMLClassificationScore,
                type(getml_score),
            )
        return ClassificationScore(
            date_time=getml_score.date_time,
            set_used=getml_score.set_used,
            target=getml_score.target,
            accuracy=getml_score.accuracy,
            auc=getml_score.auc,
            cross_entropy=getml_score.cross_entropy,
        )

    return [_to_classification_score(getml_score) for getml_score in scores]


def _serialize_regression_scores(scores: list[GetMLScore]) -> list[RegressionScore]:
    """Convert getML regression scores into ``RegressionScore`` models.

    Args:
        scores: The getML scores to convert, in order.

    Returns:
        One ``RegressionScore`` per input score, in the same order.

    Raises:
        WrongPipelineScoreTypeError: If any score is not a getML
            regression score. Conversion stops at the first such score.

    """

    def _to_regression_score(getml_score: GetMLScore) -> RegressionScore:
        # Guard first: the metric attributes read below are only accessed
        # once the score is known to be a regression score.
        if not isinstance(getml_score, GetMLRegressionScore):
            raise WrongPipelineScoreTypeError(
                GetMLRegressionScore,
                type(getml_score),
            )
        return RegressionScore(
            date_time=getml_score.date_time,
            set_used=getml_score.set_used,
            target=getml_score.target,
            mae=getml_score.mae,
            rmse=getml_score.rmse,
            rsquared=getml_score.rsquared,
        )

    return [_to_regression_score(getml_score) for getml_score in scores]
25 changes: 25 additions & 0 deletions tests/integration/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path

from getml_io.getml.features import Features
from getml_io.getml.scores import Scores
from getml_io.metadata.container_information import ContainerInformation
from getml_io.metadata.dataframe_information import (
ColumnProfile,
Expand Down Expand Up @@ -213,6 +214,10 @@ def assert_pipeline_information(
pipeline_information.features,
expected_pipeline_information.features,
)
assert_scores(
pipeline_information.scores,
expected_pipeline_information.scores,
)


def assert_features(
Expand All @@ -229,3 +234,23 @@ def assert_features(
assert feature.sql is not None
assert feature.importance is not None
assert feature.correlation is not None


def assert_scores(
    scores: Scores,
    expected_scores: Scores,
) -> None:
    """Assert that serialized scores match the expected scores structurally.

    Both sequences must have the same length. Each pair must agree on
    ``type``, ``target`` and ``set_used``. The timestamp and the metric
    values are only checked for presence and are never compared with the
    expected values.

    Args:
        scores: The scores produced by the code under test.
        expected_scores: The reference scores to compare against.

    """
    assert len(scores) == len(expected_scores)
    paired = zip(scores, expected_scores, strict=True)
    for actual, expected in paired:
        assert actual.type == expected.type
        assert actual.target == expected.target
        # NOTE(review): presence-only checks, presumably because timestamps
        # and metric values differ between runs; confirm.
        assert actual.date_time is not None
        assert actual.set_used == expected.set_used
        if actual.type == "classification":
            assert actual.accuracy is not None
            assert actual.auc is not None
            assert actual.cross_entropy is not None
        elif actual.type == "regression":
            assert actual.mae is not None
            assert actual.rmse is not None
            assert actual.rsquared is not None
Comment thread
Urfoex marked this conversation as resolved.
44 changes: 32 additions & 12 deletions tests/integration/data/loans/expected.pipeline.json
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,11 @@
"feature_learners": [
{
"aggregation": [
"SUM",
"COUNT",
"AVG",
"SUM",
"MAX",
"MIN",
"COUNT"
"MIN"
],
"allow_sets": true,
"delta_t": 0.0,
Expand All @@ -649,19 +649,19 @@
"num_threads": 0,
"propositionalization": {
"aggregation": [
"SUM",
"AVG",
"MEDIAN",
"TREND",
"COUNT MINUS COUNT DISTINCT",
"MODE",
"STDDEV",
"MAX",
"MIN",
"COUNT",
"AVG",
"TREND",
"COUNT DISTINCT",
"SUM",
"FIRST",
"MODE",
"LAST",
"COUNT"
"COUNT MINUS COUNT DISTINCT",
"MAX",
"MIN"
],
"delta_t": 0.0,
"loss_function": "CrossEntropyLoss",
Expand Down Expand Up @@ -1062,5 +1062,25 @@
"correlation": 0.06660193285904141,
"sql": ""
}
}
},
"scores": [
{
"date_time": "2025-08-19T22:33:06",
"set_used": "train",
"target": "default",
"accuracy": 0.9825708061002179,
"auc": 0.9952295229522934,
"cross_entropy": 0.08200917844672241,
"type": "classification"
},
{
"date_time": "2025-08-19T22:33:07",
"set_used": "test",
"target": "default",
"accuracy": 0.9551569506726457,
"auc": 0.8903818953323912,
"cross_entropy": 0.1751293856880503,
"type": "classification"
}
]
}
38 changes: 29 additions & 9 deletions tests/integration/data/numerical/expected.pipeline.json
Original file line number Diff line number Diff line change
Expand Up @@ -570,19 +570,19 @@
"num_threads": 0,
"propositionalization": {
"aggregation": [
"SUM",
"AVG",
"MEDIAN",
"TREND",
"COUNT MINUS COUNT DISTINCT",
"MODE",
"STDDEV",
"MAX",
"MIN",
"COUNT",
"AVG",
"TREND",
"COUNT DISTINCT",
"SUM",
"FIRST",
"MODE",
"LAST",
"COUNT"
"COUNT MINUS COUNT DISTINCT",
"MAX",
"MIN"
],
"delta_t": 0.0,
"loss_function": "SquareLoss",
Expand Down Expand Up @@ -817,5 +817,25 @@
"correlation": -0.09593210355580734,
"sql": ""
}
}
},
"scores": [
{
"date_time": "2025-08-19T22:33:16",
"set_used": "train",
"target": "targets",
"mae": 0.16209001296605818,
"rmse": 0.25502448762753166,
"rsquared": 0.999961056217837,
"type": "regression"
},
{
"date_time": "2025-08-19T22:33:18",
"set_used": "test",
"target": "targets",
"mae": 0.47556934465061534,
"rmse": 0.6929843656362203,
"rsquared": 0.9996960826789243,
"type": "regression"
}
]
}
Loading
Loading