Skip to content

feat: Transform strings with respect to property schema when conforming properties #2997

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions singer_sdk/helpers/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import copy
import datetime
import decimal
import json
import logging
import math
import typing as t
Expand Down Expand Up @@ -542,9 +543,32 @@ def _conform_primitive_property( # noqa: PLR0911
# for BIT value, treat 0 as False and anything else as True
return elem != b"\x00" if is_boolean_type(property_schema) else elem.hex()
if isinstance(elem, (float, decimal.Decimal)):
if math.isnan(elem) or math.isinf(elem):
return None
return elem
return elem if math.isfinite(elem) else None
if isinstance(elem, str) and not is_string_type(property_schema):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(commenting here to start a thread)

It looks like it was slightly affected: https://codspeed.io/meltano/sdk/branches/ReubenFrankel%3Afeat%2Fconform-primitive-interpret-string?uri=tests%2Fcore%2Ftest_typing.py%3A%3Atest_bench_conform_record_data_types

Possibly due to this new check, which will check the property schema type for every str value:

if isinstance(elem, str) and not is_string_type(property_schema):

Do you think there's a way we could prevent this regression? Perhaps caching a mapping of property > is_.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was thinking about some kind of type caching also. We did something like that here, or were you imagining it to be per-property?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, something like or like the selection mask we use internally here.

return _transform_string_property(elem, property_schema)
if _is_exclusive_boolean_type(property_schema):
return None if elem is None else elem != 0
return elem


def _transform_string_property( # noqa: PLR0911
elem: str,
property_schema: dict,
) -> t.Any: # noqa: ANN401
if not elem and is_null_type(property_schema):
return None # if nullable, None for empty string

if is_boolean_type(property_schema):
return (
elem.lower() == "true"
) # false for any non-"true" string (case-insensitive), including empty string
if is_integer_type(property_schema):
return int(elem or 0) # 0 for empty string
if is_number_type(property_schema):
d = decimal.Decimal(elem or 0) # 0 for empty string
return d if d.is_finite() else None
if is_array_type(property_schema):
return json.loads(elem) if elem else [] # empty array for empty string
if is_object_type(property_schema):
return json.loads(elem) if elem else {} # empty object for empty string
return elem
57 changes: 57 additions & 0 deletions tests/core/test_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from singer_sdk.helpers._typing import (
TypeConformanceLevel,
_conform_primitive_property,
_transform_string_property,
conform_record_data_types,
)
from singer_sdk.typing import (
Expand Down Expand Up @@ -357,12 +358,68 @@ def test_conform_object_additional_properties():
pytest.param(
decimal.Decimal("nan"), {"type": "number"}, None, id="decimal_nan_to_number"
),
pytest.param("", {"type": "string"}, "", id="string_empty_to_string"),
pytest.param(
"",
{"type": ["boolean", "null"]},
None,
id="string_empty_to_any_nullable_non_string",
),
pytest.param("true", {"type": "boolean"}, True, id="string_true_to_boolean"),
pytest.param(
"TRUE",
{"type": "boolean"},
True,
id="string_true_uppercase_to_boolean",
),
pytest.param("false", {"type": "boolean"}, False, id="string_false_to_boolean"),
pytest.param(
"something else",
{"type": "boolean"},
False,
id="string_not_true_to_boolean",
),
pytest.param("", {"type": "boolean"}, False, id="string_empty_to_boolean"),
pytest.param("3", {"type": "integer"}, 3, id="string_integer_to_integer"),
pytest.param("", {"type": "integer"}, 0, id="string_empty_to_integer"),
pytest.param(
"3.14",
{"type": "number"},
decimal.Decimal("3.14"),
id="string_float_to_number",
),
pytest.param("inf", {"type": "number"}, None, id="string_inf_to_number"),
pytest.param("nan", {"type": "number"}, None, id="string_nan_to_number"),
pytest.param(
"",
{"type": "number"},
decimal.Decimal(0),
id="string_empty_to_number",
),
pytest.param(
"[1, 2, 3]",
{"type": "array"},
[1, 2, 3],
id="string_json_array_to_array",
),
pytest.param("", {"type": "array"}, [], id="string_empty_to_array"),
pytest.param(
'{"a": 1, "b": true, "c": 3.14}',
{"type": "object"},
{"a": 1, "b": True, "c": 3.14},
id="string_json_object_to_object",
),
pytest.param("", {"type": "object"}, {}, id="string_empty_to_object"),
],
)
def test_conform_primitives(value: t.Any, type_dict: dict, expected: t.Any):
assert _conform_primitive_property(value, type_dict) == expected


def test_transform_string_to_string():
assert _transform_string_property("test", {"type": "string"}) == "test"


@pytest.mark.filterwarnings("ignore:Use `JSONSchemaToSQL` instead.:DeprecationWarning")
@pytest.mark.parametrize(
"jsonschema_type,expected",
Expand Down