Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/getml_io/getml/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from collections.abc import Mapping
from typing import ClassVar

from pydantic import BaseModel, ConfigDict


class Feature(BaseModel):
    """Serializable snapshot of a single feature of a getML pipeline.

    Instances are created by `serialize_features`, which copies the
    same-named attributes from each getML feature. The model is frozen, so
    field values cannot be reassigned after construction.
    """

    # frozen=True makes pydantic reject attribute assignment on instances.
    model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True)

    # Feature name; also used as the key in the `Features` mapping.
    name: str
    # Position of the feature as reported by getML (0-based in the fixtures).
    index: int
    # Name of the target the importance/correlation values refer to.
    target: str
    # Importance score copied from getML.
    # NOTE(review): presumably normalized per target - confirm.
    importance: float
    # Correlation copied from getML; may be negative.
    # NOTE(review): presumably correlation with `target` - confirm.
    correlation: float
    # SQL code of the feature; the fixtures show an empty string for
    # features that are plain columns (e.g. "duration").
    sql: str


# Read-only mapping from feature name to its serialized `Feature`.
Features = Mapping[str, Feature]
3 changes: 2 additions & 1 deletion src/getml_io/metadata/pipeline_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pydantic import BaseModel, ConfigDict

from getml_io.getml.feature_learning import FeatureLearner
from getml_io.getml.features import Features
from getml_io.getml.predictors import FeatureSelector, Predictor
from getml_io.getml.preprocessors import Preprocessor
from getml_io.metadata.data_model_information import DataModelInformation
Expand Down Expand Up @@ -42,7 +43,7 @@ class PipelineInformation(BaseModel):
tags: Sequence[str]
targets: Sequence[str]
data_model: DataModelInformation
# features # TODO @urfoex: #17
features: Features
# scores # TODO @urfoex: #18
# columns # TODO @urfoex: #50
# metadata # TODO @urfoex: #51
Expand Down
27 changes: 26 additions & 1 deletion src/getml_io/serialize/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Container,
DataFrame,
)
from getml.pipeline import Features as GetMLFeatures
from getml.pipeline import Pipeline
from numpy.typing import NDArray

Expand All @@ -22,6 +23,7 @@
Relboost,
RelMT,
)
from getml_io.getml.features import Feature, Features
from getml_io.getml.predictors import (
LinearRegression,
LogisticRegression,
Expand Down Expand Up @@ -120,7 +122,7 @@ def serialize_pipeline(
tags=pipeline.tags,
targets=pipeline.targets,
data_model=serialize_data_model(pipeline.data_model),
# features # TODO @urfoex: #17
features=serialize_features(pipeline.features),
# scores # TODO @urfoex: #18
# columns # TODO @urfoex: #50
# metadata # TODO @urfoex: #51
Expand Down Expand Up @@ -306,3 +308,26 @@ def serialize_preprocessor( # noqa: PLR0911
return Substring.model_validate(preprocessor_as_dict)
case getml_preprocessor.TextFieldSplitter():
return TextFieldSplitter.model_validate(preprocessor_as_dict)


def serialize_features(features: GetMLFeatures) -> Features:
    """Convert the features of a getML pipeline into `Feature` models.

    Every feature yielded by iterating over `features` is copied attribute
    by attribute into a `Feature`.

    Args:
        features: The getML features container to convert.

    Returns:
        Features: The converted features keyed by feature name. Should two
            features share a name, the one iterated last is kept.

    """
    serialized: dict[str, Feature] = {}
    for getml_feature in features:
        feature_name = getml_feature.name
        serialized[feature_name] = Feature(
            name=feature_name,
            index=getml_feature.index,
            target=getml_feature.target,
            importance=getml_feature.importance,
            correlation=getml_feature.correlation,
            sql=getml_feature.sql,
        )
    return serialized
Comment thread
Urfoex marked this conversation as resolved.
21 changes: 21 additions & 0 deletions tests/integration/assertions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections.abc import Sequence
from pathlib import Path

from getml_io.getml.features import Features
from getml_io.metadata.container_information import ContainerInformation
from getml_io.metadata.dataframe_information import (
ColumnProfile,
Expand Down Expand Up @@ -208,3 +209,23 @@ def assert_pipeline_information(
assert pipeline_information.tags
assert pipeline_information.targets == expected_pipeline_information.targets
assert pipeline_information.data_model == expected_pipeline_information.data_model
assert_features(
pipeline_information.features,
expected_pipeline_information.features,
)


def assert_features(
    features: Features,
    expected_features: Features,
) -> None:
    """Assert that serialized features match the expected ones.

    Both mappings must contain exactly the same feature names. For each
    feature, `name`, `index` and `target` are compared for equality, while
    `sql`, `importance` and `correlation` are only required to be set (not
    `None`); their values are not compared.

    Args:
        features: The features under test.
        expected_features: The reference features to compare against.

    Raises:
        AssertionError: If any of the checks fails.

    """
    compared_attributes = ("name", "index", "target")
    # NOTE(review): presumably left unpinned because the values can differ
    # between pipeline fits - confirm.
    required_attributes = ("sql", "importance", "correlation")

    assert len(features) == len(expected_features)
    assert features.keys() == expected_features.keys()
    for feature_name, actual in features.items():
        expected = expected_features[feature_name]
        for attribute in compared_attributes:
            assert getattr(actual, attribute) == getattr(expected, attribute)
        for attribute in required_attributes:
            assert getattr(actual, attribute) is not None
124 changes: 115 additions & 9 deletions tests/integration/data/loans/expected.pipeline.json
Original file line number Diff line number Diff line change
Expand Up @@ -631,10 +631,10 @@
"feature_learners": [
{
"aggregation": [
"AVG",
"SUM",
"MIN",
"AVG",
"MAX",
"MIN",
"COUNT"
],
"allow_sets": true,
Expand All @@ -649,18 +649,18 @@
"num_threads": 0,
"propositionalization": {
"aggregation": [
"COUNT DISTINCT",
"COUNT MINUS COUNT DISTINCT",
"AVG",
"SUM",
"AVG",
"MEDIAN",
"TREND",
"MIN",
"COUNT MINUS COUNT DISTINCT",
"MODE",
"STDDEV",
"LAST",
"MAX",
"MIN",
"COUNT DISTINCT",
"FIRST",
"MEDIAN",
"MODE",
"LAST",
"COUNT"
],
"delta_t": 0.0,
Expand Down Expand Up @@ -956,5 +956,111 @@
"parent": null
},
"peripheral": {}
},
"features": {
"feature_1_1": {
"name": "feature_1_1",
"index": 0,
"target": "default",
"importance": 0.1135676632264945,
"correlation": 0.490601623544188,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_1\";\n\nCREATE TABLE \"FEATURE_1_1\" AS\nSELECT MAX( t2.\"amount\" ) AS \"feature_1_1\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"TRANS__STAGING_TABLE_4\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE ( t2.\"date\" <= t1.\"date_loan\"\n) AND (\n ( ( t2.\"balance\" <= 3480.000000 ) AND ( t1.\"date_loan\" - t2.\"date\" <= 19958400.000000 ) AND ( t2.\"amount\" <= 20299.000000 ) AND ( t2.\"k_symbol\" NOT IN ( 'POJISTNE', 'SLUZBY', 'UROK', 'SANKC. UROK' ) OR t2.\"k_symbol\" IS NULL ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_2": {
"name": "feature_1_2",
"index": 1,
"target": "default",
"importance": 0.09923157995248899,
"correlation": 0.44092213849038303,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_2\";\n\nCREATE TABLE \"FEATURE_1_2\" AS\nSELECT SUM( t1.\"date_loan\" - t2.\"date\" ) AS \"feature_1_2\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"TRANS__STAGING_TABLE_4\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE ( t2.\"date\" <= t1.\"date_loan\"\n) AND (\n ( ( t2.\"k_symbol\" NOT IN ( 'SANKC. UROK' ) OR t2.\"k_symbol\" IS NULL ) AND ( t2.\"balance\" > 186780.000000 ) )\nOR ( ( t2.\"k_symbol\" IN ( 'SANKC. UROK' ) ) AND ( t1.\"frequency\" NOT IN ( 'POPLATEK MESICNE' ) OR t1.\"frequency\" IS NULL ) AND ( t2.\"amount\" > 4.000000 ) )\nOR ( ( t2.\"k_symbol\" IN ( 'SANKC. UROK' ) ) AND ( t1.\"frequency\" IN ( 'POPLATEK MESICNE' ) ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_3": {
"name": "feature_1_3",
"index": 2,
"target": "default",
"importance": 0.03450241472632771,
"correlation": -0.0021713705808225877,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_3\";\n\nCREATE TABLE \"FEATURE_1_3\" AS\nSELECT AVG( t2.\"a10\" ) AS \"feature_1_3\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"amount\" > 234442.000000 ) AND ( t2.\"type_card\" NOT IN ( 'gold', 'classic', 'junior' ) OR t2.\"type_card\" IS NULL ) AND ( t1.\"frequency\" IN ( 'POPLATEK MESICNE' ) ) AND ( t2.\"a9\" <= 9.000000 ) )\nOR ( ( t1.\"amount\" <= 234442.000000 OR t1.\"amount\" IS NULL ) AND ( t1.\"payments\" > 6912.000000 ) AND ( t2.\"a3\" NOT IN ( 'east Bohemia', 'south Moravia', 'north Bohemia', 'north Moravia' ) OR t2.\"a3\" IS NULL ) )\nOR ( ( t1.\"amount\" <= 234442.000000 OR t1.\"amount\" IS NULL ) AND ( t1.\"payments\" <= 6912.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"a12\" > 4.637500 ) AND ( t1.\"amount\" > 83764.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_4": {
"name": "feature_1_4",
"index": 3,
"target": "default",
"importance": 0.25317056825181045,
"correlation": 0.6216480344796319,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_4\";\n\nCREATE TABLE \"FEATURE_1_4\" AS\nSELECT COUNT( * ) AS \"feature_1_4\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"TRANS__STAGING_TABLE_4\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE ( t2.\"date\" <= t1.\"date_loan\"\n) AND (\n ( ( t2.\"balance\" > 13055.000000 OR t2.\"balance\" IS NULL ) AND ( t2.\"balance\" > 133677.000000 ) AND ( t1.\"frequency\" NOT IN ( 'POPLATEK MESICNE', 'POPLATEK PO OBRATU' ) OR t1.\"frequency\" IS NULL ) )\nOR ( ( t2.\"balance\" <= 13055.000000 ) AND ( t2.\"operation\" IN ( 'VYBER', 'VKLAD' ) ) AND ( t1.\"date_loan\" - t2.\"date\" <= 18423138.461538 ) AND ( t2.\"balance\" > -4311.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_5": {
"name": "feature_1_5",
"index": 4,
"target": "default",
"importance": 0.022781446408607083,
"correlation": 0.23609098488653527,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_5\";\n\nCREATE TABLE \"FEATURE_1_5\" AS\nSELECT MAX( t2.\"a10\" ) AS \"feature_1_5\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"payments\" > 6880.000000 ) AND ( t2.\"type_card\" NOT IN ( 'classic' ) OR t2.\"type_card\" IS NULL ) AND ( t1.\"payments\" > 8407.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"a3\" IN ( 'south Bohemia', 'north Moravia' ) ) )\nOR ( ( t1.\"payments\" > 6880.000000 ) AND ( t2.\"type_card\" NOT IN ( 'classic' ) OR t2.\"type_card\" IS NULL ) AND ( t1.\"payments\" <= 8407.000000 ) AND ( t2.\"a8\" <= 2.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_6": {
"name": "feature_1_6",
"index": 5,
"target": "default",
"importance": 0.1746793231004474,
"correlation": -0.18288971144527097,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_6\";\n\nCREATE TABLE \"FEATURE_1_6\" AS\nSELECT AVG( t2.\"a14\" ) AS \"feature_1_6\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"payments\" > 6874.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"type_card\" NOT IN ( 'gold', 'classic', 'junior' ) OR t2.\"type_card\" IS NULL ) AND ( t2.\"type_disp\" NOT IN ( 'OWNER' ) OR t2.\"type_disp\" IS NULL ) )\nOR ( ( t1.\"payments\" > 6874.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"type_card\" NOT IN ( 'gold', 'classic', 'junior' ) OR t2.\"type_card\" IS NULL ) AND ( t2.\"type_disp\" IN ( 'OWNER' ) ) AND ( t1.\"duration\" <= 20.000000 ) )\nOR ( ( t1.\"payments\" > 6874.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"type_card\" IN ( 'gold', 'classic', 'junior' ) ) )\nOR ( ( t1.\"payments\" <= 6874.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_7": {
"name": "feature_1_7",
"index": 6,
"target": "default",
"importance": 0.12502651911553847,
"correlation": -0.051941482060654416,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_7\";\n\nCREATE TABLE \"FEATURE_1_7\" AS\nSELECT MIN( t2.\"a11\" ) AS \"feature_1_7\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"payments\" > 7395.000000 OR t1.\"payments\" IS NULL ) AND ( t1.\"amount\" > 172870.000000 OR t1.\"amount\" IS NULL ) AND ( t2.\"a3\" NOT IN ( 'east Bohemia', 'south Moravia', 'north Bohemia', 'west Bohemia' ) OR t2.\"a3\" IS NULL ) AND ( t2.\"type_card\" IN ( 'classic' ) ) )\nOR ( ( t1.\"payments\" > 7395.000000 OR t1.\"payments\" IS NULL ) AND ( t1.\"amount\" > 172870.000000 OR t1.\"amount\" IS NULL ) AND ( t2.\"a3\" IN ( 'east Bohemia', 'south Moravia', 'north Bohemia', 'west Bohemia' ) ) AND ( t2.\"a12\" <= 3.850000 ) )\nOR ( ( t1.\"payments\" > 7395.000000 OR t1.\"payments\" IS NULL ) AND ( t1.\"amount\" <= 172870.000000 ) AND ( t2.\"a3\" NOT IN ( 'west Bohemia' ) OR t2.\"a3\" IS NULL ) )\nOR ( ( t1.\"payments\" <= 7395.000000 ) AND ( t2.\"a13\" > 8.901667 OR t2.\"a13\" IS NULL ) AND ( t2.\"gender\" NOT IN ( 'M' ) OR t2.\"gender\" IS NULL ) )\nOR ( ( t1.\"payments\" <= 7395.000000 ) AND ( t2.\"a13\" <= 8.901667 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_8": {
"name": "feature_1_8",
"index": 7,
"target": "default",
"importance": 0.009308557459210479,
"correlation": -0.03337139710997413,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_8\";\n\nCREATE TABLE \"FEATURE_1_8\" AS\nSELECT AVG( t2.\"a5\" ) AS \"feature_1_8\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"payments\" > 7042.000000 ) AND ( t2.\"a9\" > 4.000000 OR t2.\"a9\" IS NULL ) AND ( t2.\"a3\" NOT IN ( 'north Moravia', 'west Bohemia' ) OR t2.\"a3\" IS NULL ) AND ( t2.\"a11\" > 9765.000000 ) )\nOR ( ( t1.\"payments\" > 7042.000000 ) AND ( t2.\"a9\" > 4.000000 OR t2.\"a9\" IS NULL ) AND ( t2.\"a3\" IN ( 'north Moravia', 'west Bohemia' ) ) )\nOR ( ( t1.\"payments\" > 7042.000000 ) AND ( t2.\"a9\" <= 4.000000 ) )\nOR ( ( t1.\"payments\" <= 7042.000000 OR t1.\"payments\" IS NULL ) AND ( t2.\"a14\" <= 95.000000 ) AND ( t2.\"a6\" > 25.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_9": {
"name": "feature_1_9",
"index": 8,
"target": "default",
"importance": 0.007941830306946887,
"correlation": -0.0232127074123776,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_9\";\n\nCREATE TABLE \"FEATURE_1_9\" AS\nSELECT AVG( t2.\"a13\" ) AS \"feature_1_9\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t2.\"a11\" > 8329.000000 OR t2.\"a11\" IS NULL ) AND ( t2.\"a4\" > 367714.000000 ) AND ( t2.\"type_card\" NOT IN ( 'classic' ) OR t2.\"type_card\" IS NULL ) )\nOR ( ( t2.\"a11\" <= 8329.000000 ) AND ( t1.\"amount\" > 235098.000000 ) AND ( t2.\"a3\" NOT IN ( 'south Moravia' ) OR t2.\"a3\" IS NULL ) )\nOR ( ( t2.\"a11\" <= 8329.000000 ) AND ( t1.\"amount\" <= 235098.000000 OR t1.\"amount\" IS NULL ) AND ( t2.\"a8\" > 2.000000 ) AND ( t1.\"duration\" <= 30.000000 ) )\n)\nGROUP BY t1.rowid;"
},
"feature_1_10": {
"name": "feature_1_10",
"index": 9,
"target": "default",
"importance": 0.0010942671416343636,
"correlation": 0.15435132513852295,
"sql": "DROP TABLE IF EXISTS \"FEATURE_1_10\";\n\nCREATE TABLE \"FEATURE_1_10\" AS\nSELECT MIN( t2.\"a12\" ) AS \"feature_1_10\",\n t1.rowid AS rownum\nFROM \"POPULATION__STAGING_TABLE_1\" t1\nINNER JOIN \"META__STAGING_TABLE_2\" t2\nON t1.\"account_id\" = t2.\"account_id\"\nWHERE (\n ( ( t1.\"payments\" > 7240.000000 ) AND ( t2.\"type_card\" NOT IN ( 'gold', 'classic' ) OR t2.\"type_card\" IS NULL ) AND ( t1.\"frequency\" IN ( 'POPLATEK MESICNE', 'POPLATEK TYDNE' ) ) AND ( t2.\"type_disp\" NOT IN ( 'DISPONENT' ) OR t2.\"type_disp\" IS NULL ) )\n)\nGROUP BY t1.rowid;"
},
"duration": {
"name": "duration",
"index": 10,
"target": "default",
"importance": 0.0005039782195063603,
"correlation": -0.0271918442877901,
"sql": ""
},
"payments": {
"name": "payments",
"index": 11,
"target": "default",
"importance": 0.0788924833005465,
"correlation": 0.10606259213768955,
"sql": ""
},
"amount": {
"name": "amount",
"index": 12,
"target": "default",
"importance": 0.07929936879044087,
"correlation": 0.06660193285904141,
"sql": ""
}
}
}
Loading
Loading