Skip to content

Commit fd6ce34

Browse files
committed
fix(ingest): handle tuple and enum keys in Report.to_pure_python_obj()
DataHub's Report.to_pure_python_obj() method failed when report dataclasses contained dictionaries with tuple or enum keys. This occurred because: 1. dataclasses.asdict() raises TypeError when dict fields have enum keys 2. The dict comprehension in to_pure_python_obj() didn't handle tuple/enum keys The fix: - Manually iterate dataclass fields instead of using dataclasses.asdict() - Explicitly convert tuple keys to their string representation - Convert enum keys to their values for better JSON readability - Recursively process results from SupportsAsObj.as_obj() This enables IngestionStageReport to serialize correctly when it has enum keys in ingestion_high_stage_seconds dict field. Note: A previous fix (commit 7b5680e) worked around this in ingestion_stage.py by converting tuple keys to strings at the source. This fix handles the problem at the framework level, making all reports robust against non-string dict keys.
1 parent ba7842d commit fd6ce34

File tree

2 files changed

+259
-7
lines changed

2 files changed

+259
-7
lines changed

metadata-ingestion/src/datahub/ingestion/api/report.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,24 @@ def to_pure_python_obj(some_val: Any) -> Any:
6363
"""A cheap way to generate a dictionary."""
6464

6565
if isinstance(some_val, SupportsAsObj):
66-
return some_val.as_obj()
66+
# Call as_obj() and recursively process the result
67+
# This handles cases like TopKDict which may return dicts with tuple/enum keys
68+
return Report.to_pure_python_obj(some_val.as_obj())
6769
elif isinstance(some_val, pydantic.BaseModel):
6870
return Report.to_pure_python_obj(some_val.model_dump())
6971
elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
7072
# The `is_dataclass` function returns `True` for both instances and classes.
7173
# We need an extra check to ensure an instance was passed in.
7274
# https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass
73-
return dataclasses.asdict(some_val)
75+
76+
# Manually convert dataclass fields to handle dicts with enum/tuple keys
77+
# dataclasses.asdict() fails when dict fields have enum keys
78+
result = {}
79+
for field in dataclasses.fields(some_val):
80+
field_value = getattr(some_val, field.name)
81+
# Recursively convert field values (which will handle dicts with enum keys)
82+
result[field.name] = Report.to_pure_python_obj(field_value)
83+
return result
7484
elif isinstance(some_val, list):
7585
return [Report.to_pure_python_obj(v) for v in some_val if v is not None]
7686
elif isinstance(some_val, timedelta):
@@ -82,11 +92,25 @@ def to_pure_python_obj(some_val: Any) -> Any:
8292
# we don't want to fail reporting because we were unable to pretty print a timestamp
8393
return str(datetime)
8494
elif isinstance(some_val, dict):
85-
return {
86-
Report.to_str(k): Report.to_pure_python_obj(v)
87-
for k, v in some_val.items()
88-
if v is not None
89-
}
95+
result = {}
96+
for k, v in some_val.items():
97+
if v is None:
98+
continue
99+
100+
# Handle tuple keys - JSON doesn't support tuple keys, convert to string
101+
if isinstance(k, tuple):
102+
key = str(k)
103+
# Handle enum keys - use the enum's value for better readability
104+
elif isinstance(k, Enum):
105+
key = k.value if hasattr(k, "value") else str(k)
106+
# Use existing to_str logic for other types
107+
else:
108+
key = Report.to_str(k)
109+
110+
# Recursively convert values
111+
result[key] = Report.to_pure_python_obj(v)
112+
113+
return result
90114
elif isinstance(some_val, (int, float, bool)):
91115
return some_val
92116
else:
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
"""Tests for Report serialization with various key types."""
2+
3+
import json
4+
import time
5+
from enum import Enum
6+
7+
from datahub.ingestion.api.report import Report
8+
from datahub.ingestion.source_report.ingestion_stage import (
9+
IngestionHighStage,
10+
IngestionStageReport,
11+
)
12+
from datahub.utilities.stats_collections import TopKDict
13+
14+
15+
class TestReportTupleKeys:
16+
"""Test Report.to_pure_python_obj() with tuple keys."""
17+
18+
def test_simple_tuple_keys(self):
19+
"""Test that simple tuple keys are converted to strings."""
20+
input_dict = {("a", "b"): 1, ("c", "d"): 2}
21+
result = Report.to_pure_python_obj(input_dict)
22+
23+
# Tuple keys should be converted to string representation
24+
assert "('a', 'b')" in result
25+
assert result["('a', 'b')"] == 1
26+
assert "('c', 'd')" in result
27+
assert result["('c', 'd')"] == 2
28+
29+
# Result should be JSON serializable
30+
json.dumps(result)
31+
32+
def test_nested_tuple_keys(self):
33+
"""Test that nested dicts with tuple keys are handled."""
34+
input_dict = {
35+
"outer": {("inner", "tuple"): "value"},
36+
("outer", "tuple"): {"inner": "value2"},
37+
}
38+
result = Report.to_pure_python_obj(input_dict)
39+
40+
# Both levels should have tuple keys converted
41+
assert "outer" in result
42+
assert "('inner', 'tuple')" in result["outer"]
43+
assert "('outer', 'tuple')" in result
44+
45+
# Should be JSON serializable
46+
json.dumps(result)
47+
48+
def test_topkdict_with_tuple_keys(self):
49+
"""Test TopKDict with tuple keys."""
50+
tk: TopKDict = TopKDict()
51+
tk[("stage1", "substage1")] = 1.5
52+
tk[("stage2", "substage2")] = 2.5
53+
54+
result = Report.to_pure_python_obj(tk)
55+
56+
# TopKDict.as_obj() returns a regular dict, then our code converts tuple keys
57+
assert isinstance(result, dict)
58+
assert "('stage1', 'substage1')" in result
59+
assert result["('stage1', 'substage1')"] == 1.5
60+
61+
# Should be JSON serializable
62+
json.dumps(result)
63+
64+
65+
class TestReportEnumKeys:
66+
"""Test Report.to_pure_python_obj() with enum keys."""
67+
68+
def test_enum_keys_converted_to_values(self):
69+
"""Test that enum keys are converted to their values."""
70+
71+
class TestEnum(Enum):
72+
VALUE1 = "value1"
73+
VALUE2 = "value2"
74+
75+
input_dict = {TestEnum.VALUE1: 1, TestEnum.VALUE2: 2}
76+
result = Report.to_pure_python_obj(input_dict)
77+
78+
# Enum keys should be converted to their string values
79+
assert "value1" in result
80+
assert result["value1"] == 1
81+
assert "value2" in result
82+
assert result["value2"] == 2
83+
84+
# Should be JSON serializable
85+
json.dumps(result)
86+
87+
def test_ingestion_high_stage_enum_keys(self):
88+
"""Test IngestionHighStage enum keys (actual use case)."""
89+
input_dict = {
90+
IngestionHighStage._UNDEFINED: 10.5,
91+
IngestionHighStage.PROFILING: 5.2,
92+
}
93+
result = Report.to_pure_python_obj(input_dict)
94+
95+
# Should have enum values as keys
96+
assert "Ingestion" in result # IngestionHighStage._UNDEFINED.value
97+
assert "Profiling" in result # IngestionHighStage.PROFILING.value
98+
99+
# Should be JSON serializable
100+
json.dumps(result)
101+
102+
103+
class TestIngestionStageReportSerialization:
104+
"""Integration tests with actual IngestionStageReport."""
105+
106+
def test_ingestion_stage_report_json_serializable(self):
107+
"""Test that IngestionStageReport produces JSON-serializable output via Report.to_pure_python_obj()."""
108+
report = IngestionStageReport()
109+
110+
# Create some stage data (simulates real usage)
111+
with report.new_stage("Test Stage 1"):
112+
time.sleep(0.01)
113+
with report.new_stage("Test Stage 2"):
114+
time.sleep(0.01)
115+
116+
# Convert using Report.to_pure_python_obj (IngestionStageReport is a dataclass, not a Report)
117+
report_dict = Report.to_pure_python_obj(report)
118+
119+
# Should be JSON serializable (this was failing before the fix)
120+
json_str = json.dumps(report_dict)
121+
assert json_str
122+
123+
# Verify structure
124+
parsed = json.loads(json_str)
125+
assert "ingestion_stage_durations" in parsed
126+
assert "ingestion_high_stage_seconds" in parsed
127+
128+
def test_ingestion_stage_durations_have_string_keys(self):
129+
"""Test that ingestion_stage_durations keys are strings after conversion."""
130+
report = IngestionStageReport()
131+
132+
with report.new_stage("Test Stage"):
133+
time.sleep(0.01)
134+
135+
report_dict = Report.to_pure_python_obj(report)
136+
137+
# All keys should be strings
138+
for key in report_dict.get("ingestion_stage_durations", {}):
139+
assert isinstance(key, str), f"Key should be string, got {type(key)}"
140+
141+
def test_high_stage_seconds_have_string_keys(self):
142+
"""Test that ingestion_high_stage_seconds keys are strings after conversion."""
143+
report = IngestionStageReport()
144+
report.ingestion_high_stage_seconds[IngestionHighStage.PROFILING] = 10.0
145+
146+
report_dict = Report.to_pure_python_obj(report)
147+
148+
# All keys should be strings (enum values)
149+
for key in report_dict.get("ingestion_high_stage_seconds", {}):
150+
assert isinstance(key, str), f"Key should be string, got {type(key)}"
151+
152+
153+
class TestBackwardCompatibility:
154+
"""Ensure fix doesn't break existing functionality."""
155+
156+
def test_existing_string_keys_unchanged(self):
157+
"""Test that existing behavior with string keys is preserved."""
158+
input_dict = {"key1": "value1", "key2": 2, "key3": {"nested": "value"}}
159+
result = Report.to_pure_python_obj(input_dict)
160+
161+
assert result == input_dict
162+
json.dumps(result)
163+
164+
def test_existing_int_keys_converted_to_strings(self):
165+
"""Test that int keys are converted to strings (for JSON compatibility)."""
166+
input_dict = {1: "one", 2: "two", 3: "three"}
167+
result = Report.to_pure_python_obj(input_dict)
168+
169+
# Integer keys are converted to strings for JSON compatibility
170+
assert "1" in result
171+
assert "2" in result
172+
assert "3" in result
173+
json.dumps(result)
174+
175+
def test_mixed_key_types(self):
176+
"""Test dict with mixed key types."""
177+
input_dict = {
178+
"string": 1,
179+
123: 2,
180+
("tuple", "key"): 3,
181+
}
182+
result = Report.to_pure_python_obj(input_dict)
183+
184+
assert "string" in result
185+
assert "123" in result # Integer keys are converted to strings
186+
assert "('tuple', 'key')" in result
187+
json.dumps(result)
188+
189+
190+
class TestEdgeCases:
191+
"""Test edge cases and error handling."""
192+
193+
def test_empty_dict(self):
194+
"""Test converting empty dictionary."""
195+
assert Report.to_pure_python_obj({}) == {}
196+
197+
def test_none_values_filtered(self):
198+
"""Test that None values are filtered out."""
199+
input_dict = {"key1": "value1", "key2": None, ("tuple",): "value2"}
200+
result = Report.to_pure_python_obj(input_dict)
201+
202+
assert "key1" in result
203+
assert "key2" not in result
204+
assert "('tuple',)" in result
205+
206+
def test_nested_tuples_in_keys(self):
207+
"""Test tuples containing tuples as keys."""
208+
input_dict = {(("nested",), "tuple"): "value"}
209+
result = Report.to_pure_python_obj(input_dict)
210+
211+
# Should convert to string representation
212+
assert "(('nested',), 'tuple')" in result
213+
json.dumps(result)
214+
215+
def test_enum_without_value_attribute(self):
216+
"""Test enum keys without explicit value attribute."""
217+
218+
class IntEnum(Enum):
219+
FIRST = 1
220+
SECOND = 2
221+
222+
input_dict = {IntEnum.FIRST: "first", IntEnum.SECOND: "second"}
223+
result = Report.to_pure_python_obj(input_dict)
224+
225+
# Should use the value
226+
assert 1 in result
227+
assert 2 in result
228+
json.dumps(result)

0 commit comments

Comments
 (0)