⚡️ Speed up function serialize_data by 10%
#46
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 10% (0.10x) speedup for `serialize_data` in `cognee/infrastructure/databases/vector/pgvector/serialize_data.py`
⏱️ Runtime: 5.24 milliseconds → 4.78 milliseconds (best of 250 runs)
📝 Explanation and details
The optimization achieves a 9% speedup by replacing `isinstance()` checks with `type()` comparisons and caching the `dict.items()` method call.

Key optimizations:

- `type()` vs `isinstance()` replacement: Changed `isinstance(data, dict)` to `type(data) is dict` for all type checks. The `type()` comparison is faster because it performs a direct type equality check rather than traversing the method resolution order to handle inheritance. Since this function deals with exact type matching for serialization (not subclass polymorphism), this optimization is safe and effective.
- Method call caching: Extracted `items = data.items()` outside the dictionary comprehension to avoid repeated method lookups during the comprehension execution.

Performance impact by test case type:

- … `isinstance()` overhead
- … `type()` checks still need to be performed even for empty collections

The optimization is particularly effective for workloads with many datetime/UUID objects or large collections, where the cumulative savings from faster type checking become significant. The reordering of checks (datetime/UUID first) also optimizes for the most transformation-heavy operations.
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
from datetime import datetime, timedelta
from uuid import UUID, uuid4
imports
import pytest
from cognee.infrastructure.databases.vector.pgvector.serialize_data import
serialize_data
unit tests
--------------------
1. Basic Test Cases
--------------------
def test_basic_primitive_types():
    """Primitives (int, float, str, bool, None) must be returned unchanged."""
    codeflash_output = serialize_data(42)  # 488ns -> 529ns (7.75% slower)
    assert codeflash_output == 42
    codeflash_output = serialize_data(3.14)  # 238ns -> 229ns (3.93% faster)
    assert codeflash_output == 3.14
    codeflash_output = serialize_data("hello")  # 204ns -> 174ns (17.2% faster)
    assert codeflash_output == "hello"
    codeflash_output = serialize_data(True)  # 304ns -> 168ns (81.0% faster)
    assert codeflash_output is True
    codeflash_output = serialize_data(None)  # 203ns -> 165ns (23.0% faster)
    assert codeflash_output is None
def test_basic_datetime_serialization():
    """A datetime must be converted to its ISO 8601 string."""
    dt = datetime(2024, 6, 1, 12, 34, 56)
    codeflash_output = serialize_data(dt)  # 2.26μs -> 2.31μs (2.16% slower)
    assert codeflash_output == dt.isoformat()
def test_basic_uuid_serialization():
    """A UUID must be converted to its string form."""
    u = UUID("12345678123456781234567812345678")
    codeflash_output = serialize_data(u)  # 2.18μs -> 2.55μs (14.5% slower)
    assert codeflash_output == str(u)
def test_basic_list_serialization():
    """A list of primitives must serialize to an equal list."""
    data = [1, "a", 3.5, None]
    codeflash_output = serialize_data(data)  # 1.65μs -> 1.63μs (1.29% faster)
    assert codeflash_output == [1, "a", 3.5, None]
def test_basic_dict_serialization():
    """A dict of primitives must serialize to an equal dict."""
    data = {"a": 1, "b": "x", "c": None}
    codeflash_output = serialize_data(data)  # 1.79μs -> 1.70μs (5.37% faster)
    assert codeflash_output == {"a": 1, "b": "x", "c": None}
def test_nested_dict_and_list():
    """Nested dicts and lists must be serialized recursively."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = {
        "name": "test",
        "created": dt,
        "ids": [u, 42, {"dt": dt}],
        "meta": {"uuid": u, "active": True},
    }
    expected = {
        "name": "test",
        "created": dt.isoformat(),
        "ids": [str(u), 42, {"dt": dt.isoformat()}],
        "meta": {"uuid": str(u), "active": True},
    }
    codeflash_output = serialize_data(data)  # 6.72μs -> 6.73μs (0.119% slower)
    # `expected` was previously built but never compared against the result.
    assert codeflash_output == expected
--------------------
2. Edge Test Cases
--------------------
def test_empty_list_and_dict():
    """Empty list and empty dict must serialize to empty containers."""
    codeflash_output = serialize_data([])  # 689ns -> 795ns (13.3% slower)
    assert codeflash_output == []
    codeflash_output = serialize_data({})  # 632ns -> 780ns (19.0% slower)
    assert codeflash_output == {}
def test_list_of_empty_dicts():
    """A list of empty dicts must serialize to an equal list."""
    codeflash_output = serialize_data([{}, {}, {}])  # 1.73μs -> 2.08μs (16.8% slower)
    assert codeflash_output == [{}, {}, {}]
def test_dict_with_empty_list_and_dict():
    """A dict holding an empty list and an empty dict must round-trip unchanged."""
    codeflash_output = serialize_data({"a": [], "b": {}})  # 1.72μs -> 2.02μs (14.9% slower)
    assert codeflash_output == {"a": [], "b": {}}
def test_datetime_with_timezone():
    """A timezone-aware datetime must serialize to its own ``isoformat()``."""
    import pytz

    tz = pytz.timezone("US/Eastern")
    # NOTE(review): passing a pytz zone through ``tzinfo=`` typically yields the
    # zone's LMT offset rather than EST/EDT; harmless here because the expected
    # value is derived from the same object.
    dt = datetime(2024, 6, 1, 12, 0, 0, tzinfo=tz)
    # The output should match dt.isoformat()
    codeflash_output = serialize_data(dt)  # 5.61μs -> 5.48μs (2.39% faster)
    assert codeflash_output == dt.isoformat()
def test_uuid_edge_cases():
    """Several different UUID values must each serialize to ``str(uuid)``."""
    for u in [UUID(int=0), uuid4(), UUID("12345678-1234-5678-1234-567812345678")]:
        codeflash_output = serialize_data(u)  # 4.48μs -> 4.35μs (2.99% faster)
        assert codeflash_output == str(u)
def test_dict_with_non_string_keys():
    """Non-string keys must be preserved (even though not JSON-serializable)."""
    data = {1: "a", (2, 3): "b"}
    codeflash_output = serialize_data(data)  # 1.51μs -> 1.66μs (8.46% slower)
    assert codeflash_output == {1: "a", (2, 3): "b"}
def test_list_with_various_types():
    """Each element of a mixed list must be serialized according to its type."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = [dt, u, 123, "abc", None, [dt, u]]
    expected = [dt.isoformat(), str(u), 123, "abc", None, [dt.isoformat(), str(u)]]
    codeflash_output = serialize_data(data)  # 5.37μs -> 5.04μs (6.47% faster)
    assert codeflash_output == expected
def test_deeply_nested_structures():
    """Deeply nested dict/list structures must be serialized at every level."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = {"a": [{"b": [{"c": [{"d": [dt, u]}]}]}]}
    expected = {"a": [{"b": [{"c": [{"d": [dt.isoformat(), str(u)]}]}]}]}
    codeflash_output = serialize_data(data)  # 5.41μs -> 5.80μs (6.66% slower)
    assert codeflash_output == expected
def test_preserves_mutable_reference_types():
    """Serialization must not mutate the input dict."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = {"dt": dt, "u": u}
    orig = data.copy()
    serialize_data(data)  # 4.09μs -> 4.28μs (4.44% slower)
    # `orig` was previously captured but never compared.
    assert data == orig
def test_handles_unknown_custom_object():
    """An object that is not datetime, UUID, dict, or list is returned as-is."""

    class Dummy:
        pass

    d = Dummy()
    codeflash_output = serialize_data(d)  # 571ns -> 487ns (17.2% faster)
    assert codeflash_output is d
def test_handles_tuple_and_set():
    """Tuples and sets are not serialized; they must be returned as-is."""
    tup = (1, 2, 3)
    st = {1, 2, 3}
    codeflash_output = serialize_data(tup)  # 567ns -> 431ns (31.6% faster)
    assert codeflash_output == tup
    codeflash_output = serialize_data(st)  # 301ns -> 274ns (9.85% faster)
    assert codeflash_output == st
-----------------------------
3. Large Scale Test Cases
-----------------------------
def test_large_list_of_datetimes():
    """A list of 1000 datetimes must serialize element-wise to ISO strings."""
    dts = [datetime(2024, 1, 1) + timedelta(days=i) for i in range(1000)]
    codeflash_output = serialize_data(dts); result = codeflash_output  # 575μs -> 516μs (11.4% faster)
    assert len(result) == len(dts)
    for i, val in enumerate(result):
        assert val == dts[i].isoformat()
def test_large_dict_of_uuids():
    """A dict of 1000 UUID values must serialize each value to a string."""
    uuids = {str(i): uuid4() for i in range(1000)}
    codeflash_output = serialize_data(uuids); result = codeflash_output  # 567μs -> 504μs (12.5% faster)
    assert result.keys() == uuids.keys()
    for k, v in result.items():
        assert v == str(uuids[k])
def test_large_nested_structure():
    """A list of 100 dicts, each with a nested list, must serialize fully."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = [{"x": dt, "y": [u for _ in range(10)]} for _ in range(100)]
    codeflash_output = serialize_data(data); result = codeflash_output  # 663μs -> 610μs (8.79% faster)
    assert len(result) == 100
    for item in result:
        assert item == {"x": dt.isoformat(), "y": [str(u)] * 10}
def test_large_mixed_structure():
    """A large mix of datetimes, UUIDs and primitives must serialize correctly."""
    dt = datetime(2024, 6, 1, 12, 0, 0)
    u = uuid4()
    data = {
        "dates": [dt for _ in range(500)],
        "uuids": [u for _ in range(500)],
        "nested": [{"d": dt, "u": u, "val": i} for i in range(100)],
    }
    codeflash_output = serialize_data(data); result = codeflash_output  # 721μs -> 650μs (11.1% faster)
    assert result["dates"] == [dt.isoformat()] * 500
    assert result["uuids"] == [str(u)] * 500
    for i, item in enumerate(result["nested"]):
        assert item == {"d": dt.isoformat(), "u": str(u), "val": i}
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from datetime import datetime, timedelta, timezone
from uuid import UUID, uuid4
imports
import pytest # used for our unit tests
from cognee.infrastructure.databases.vector.pgvector.serialize_data import
serialize_data
unit tests
------------------ BASIC TEST CASES ------------------
def test_serialize_int():
    """An integer must be returned unchanged."""
    codeflash_output = serialize_data(42)  # 562ns -> 471ns (19.3% faster)
    assert codeflash_output == 42
def test_serialize_float():
    """A float must be returned unchanged."""
    codeflash_output = serialize_data(3.14)  # 433ns -> 452ns (4.20% slower)
    assert codeflash_output == 3.14
def test_serialize_str():
    """A string must be returned unchanged."""
    codeflash_output = serialize_data("hello")  # 411ns -> 413ns (0.484% slower)
    assert codeflash_output == "hello"
def test_serialize_bool():
    """Booleans must be returned unchanged."""
    codeflash_output = serialize_data(True)  # 507ns -> 431ns (17.6% faster)
    assert codeflash_output is True
    codeflash_output = serialize_data(False)  # 255ns -> 222ns (14.9% faster)
    assert codeflash_output is False
def test_serialize_none():
    """None must be returned unchanged."""
    codeflash_output = serialize_data(None)  # 414ns -> 436ns (5.05% slower)
    assert codeflash_output is None
def test_serialize_datetime():
    """A datetime must be converted to its ISO 8601 string."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    codeflash_output = serialize_data(dt)  # 2.42μs -> 2.51μs (3.59% slower)
    assert codeflash_output == dt.isoformat()
def test_serialize_uuid():
    """A UUID must be converted to its string form."""
    u = uuid4()
    codeflash_output = serialize_data(u)  # 2.58μs -> 2.60μs (0.884% slower)
    assert codeflash_output == str(u)
def test_serialize_list_of_ints():
    """A list of integers must serialize to an equal list."""
    codeflash_output = serialize_data([1, 2, 3])  # 1.54μs -> 1.56μs (1.41% slower)
    assert codeflash_output == [1, 2, 3]
def test_serialize_list_of_mixed_types():
    """Each element of a mixed list must be serialized according to its type."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    codeflash_output = serialize_data([dt, u, 5, "a"]); result = codeflash_output  # 5.61μs -> 5.56μs (0.845% faster)
    assert result == [dt.isoformat(), str(u), 5, "a"]
def test_serialize_dict_of_mixed_types():
    """Each dict value must be serialized according to its type."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {"date": dt, "id": u, "num": 7, "text": "hi"}
    expected = {"date": dt.isoformat(), "id": str(u), "num": 7, "text": "hi"}
    codeflash_output = serialize_data(d)  # 3.79μs -> 3.66μs (3.44% faster)
    assert codeflash_output == expected
def test_serialize_nested_dict():
    """Nested dicts must be serialized recursively."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {"outer": {"date": dt, "id": u}}
    expected = {"outer": {"date": dt.isoformat(), "id": str(u)}}
    codeflash_output = serialize_data(d)  # 3.62μs -> 3.68μs (1.47% slower)
    assert codeflash_output == expected
def test_serialize_nested_list():
    """Nested lists must be serialized recursively."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    # Renamed from the ambiguous single-letter `l`.
    nested = [dt, [u, [42, "x"]]]
    expected = [dt.isoformat(), [str(u), [42, "x"]]]
    codeflash_output = serialize_data(nested)  # 3.91μs -> 3.88μs (0.928% faster)
    assert codeflash_output == expected
def test_serialize_dict_with_list_values():
    """Lists stored as dict values must be serialized element-wise."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {"dates": [dt, dt], "ids": [u, u]}
    expected = {"dates": [dt.isoformat(), dt.isoformat()], "ids": [str(u), str(u)]}
    codeflash_output = serialize_data(d)  # 5.13μs -> 5.15μs (0.369% slower)
    assert codeflash_output == expected
------------------ EDGE TEST CASES ------------------
def test_serialize_empty_list():
    """An empty list must serialize to an empty list."""
    codeflash_output = serialize_data([])  # 690ns -> 798ns (13.5% slower)
    assert codeflash_output == []
def test_serialize_empty_dict():
    """An empty dict must serialize to an empty dict."""
    codeflash_output = serialize_data({})  # 787ns -> 958ns (17.8% slower)
    assert codeflash_output == {}
def test_serialize_dict_with_empty_list():
    """A dict whose value is an empty list must round-trip unchanged."""
    codeflash_output = serialize_data({"a": []})  # 1.31μs -> 1.57μs (16.5% slower)
    assert codeflash_output == {"a": []}
def test_serialize_list_with_empty_dict():
    """A list containing an empty dict must round-trip unchanged."""
    codeflash_output = serialize_data([{}])  # 1.29μs -> 1.54μs (16.3% slower)
    assert codeflash_output == [{}]
def test_serialize_deeply_nested_structure():
    """Deeply nested dicts/lists must be serialized at every level."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    data = {"a": [{"b": {"c": [dt, {"d": u}]}}]}
    expected = {"a": [{"b": {"c": [dt.isoformat(), {"d": str(u)}]}}]}
    codeflash_output = serialize_data(data)  # 5.28μs -> 5.53μs (4.59% slower)
    assert codeflash_output == expected
def test_serialize_datetime_with_timezone():
    """A UTC-aware datetime must serialize to its own ``isoformat()``."""
    dt = datetime(2023, 6, 1, 12, 30, 45, tzinfo=timezone.utc)
    codeflash_output = serialize_data(dt)  # 4.15μs -> 4.21μs (1.35% slower)
    assert codeflash_output == dt.isoformat()
def test_serialize_uuid_as_string():
    """A UUID already in string form must not be altered."""
    u = str(uuid4())
    codeflash_output = serialize_data(u)  # 431ns -> 478ns (9.83% slower)
    assert codeflash_output == u
def test_serialize_dict_with_non_str_keys():
    """Non-str keys must be preserved; only the values are serialized."""
    d = {1: "one", (2, 3): "tuple", uuid4(): "uuid"}
    # Only values are serialized, keys remain unchanged
    codeflash_output = serialize_data(d); result = codeflash_output  # 1.96μs -> 2.08μs (6.01% slower)
    assert result.keys() == d.keys()
    for k, v in d.items():
        codeflash_output = serialize_data(v)  # 728ns -> 613ns (18.8% faster)
        assert result[k] == codeflash_output
def test_serialize_list_with_none():
    """None entries inside a list must be preserved."""
    codeflash_output = serialize_data([None, 1, "a"])  # 1.46μs -> 1.39μs (4.96% faster)
    assert codeflash_output == [None, 1, "a"]
def test_serialize_dict_with_none_values():
    """None values inside a dict must be preserved."""
    codeflash_output = serialize_data({"x": None, "y": 1})  # 1.47μs -> 1.54μs (4.54% slower)
    assert codeflash_output == {"x": None, "y": 1}
def test_serialize_custom_object():
    """Custom objects must be returned unchanged."""

    class MyObj:
        pass

    obj = MyObj()
    codeflash_output = serialize_data(obj)  # 534ns -> 463ns (15.3% faster)
    assert codeflash_output is obj
def test_serialize_tuple():
    """A tuple must be returned unchanged (not supported for serialization)."""
    tup = (1, 2, 3)
    codeflash_output = serialize_data(tup)  # 576ns -> 458ns (25.8% faster)
    assert codeflash_output == tup
def test_serialize_set():
    """A set must be returned unchanged (not supported for serialization)."""
    s = {1, 2, 3}
    codeflash_output = serialize_data(s)  # 442ns -> 421ns (4.99% faster)
    assert codeflash_output == s
def test_serialize_bytes():
    """Bytes must be returned unchanged."""
    b = b"abc"
    codeflash_output = serialize_data(b)  # 546ns -> 437ns (24.9% faster)
    assert codeflash_output == b
def test_serialize_dict_with_various_types():
    """A dict mixing supported and unsupported value types must serialize correctly.

    datetime and UUID values are converted; every other value is left as-is.
    """
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {
        "dt": dt,
        "uuid": u,
        "int": 5,
        "float": 3.14,
        "str": "hello",
        "none": None,
        "tuple": (1, 2),
        "set": {1, 2},
        "bytes": b"abc",
    }
    expected = {
        "dt": dt.isoformat(),
        "uuid": str(u),
        "int": 5,
        "float": 3.14,
        "str": "hello",
        "none": None,
        "tuple": (1, 2),
        "set": {1, 2},
        "bytes": b"abc",
    }
    codeflash_output = serialize_data(d)  # 5.24μs -> 4.55μs (15.1% faster)
    assert codeflash_output == expected
------------------ LARGE SCALE TEST CASES ------------------
def test_serialize_large_list_of_datetimes():
    """A list of 500 datetimes must serialize element-wise to ISO strings."""
    dts = [datetime(2023, 6, 1, 12, 30, 45) for _ in range(500)]
    expected = [dt.isoformat() for dt in dts]
    codeflash_output = serialize_data(dts)  # 283μs -> 253μs (11.8% faster)
    assert codeflash_output == expected
def test_serialize_large_list_of_uuids():
    """A list of 500 UUIDs must serialize element-wise to strings."""
    uuids = [uuid4() for _ in range(500)]
    expected = [str(u) for u in uuids]
    codeflash_output = serialize_data(uuids)  # 261μs -> 230μs (13.0% faster)
    assert codeflash_output == expected
def test_serialize_large_dict_of_mixed_types():
    """A 500-key dict of mixed-type lists must serialize every value."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {f"key{i}": [dt, u, i] for i in range(500)}
    expected = {f"key{i}": [dt.isoformat(), str(u), i] for i in range(500)}
    codeflash_output = serialize_data(d)  # 803μs -> 756μs (6.18% faster)
    assert codeflash_output == expected
def test_serialize_large_nested_structure():
    """A list of 200 dicts, each with a datetime, a UUID and a nested list, must serialize fully."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    # Nested lists of dicts, each with a datetime and uuid
    data = [{"dt": dt, "id": u, "vals": [i, dt, u]} for i in range(200)]
    expected = [
        {"dt": dt.isoformat(), "id": str(u), "vals": [i, dt.isoformat(), str(u)]}
        for i in range(200)
    ]
    codeflash_output = serialize_data(data)  # 626μs -> 593μs (5.50% faster)
    assert codeflash_output == expected
def test_serialize_large_mixed_list():
    """A 600-element list cycling datetime / UUID / int must serialize element-wise."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    data = [dt if i % 3 == 0 else u if i % 3 == 1 else i for i in range(600)]
    expected = [
        dt.isoformat() if i % 3 == 0 else str(u) if i % 3 == 1 else i
        for i in range(600)
    ]
    codeflash_output = serialize_data(data)  # 267μs -> 238μs (12.3% faster)
    assert codeflash_output == expected
def test_serialize_large_dict_with_large_lists():
    """A dict whose values are 300-element lists must serialize every element."""
    dt = datetime(2023, 6, 1, 12, 30, 45)
    u = uuid4()
    d = {
        "dates": [dt for _ in range(300)],
        "uuids": [u for _ in range(300)],
        "ints": [i for i in range(300)],
    }
    expected = {
        "dates": [dt.isoformat() for _ in range(300)],
        "uuids": [str(u) for _ in range(300)],
        "ints": [i for i in range(300)],
    }
    codeflash_output = serialize_data(d)  # 367μs -> 322μs (13.9% faster)
    assert codeflash_output == expected
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes, run
`git checkout codeflash/optimize-serialize_data-mhts0c9o` and push.