From 2461c1be3996abbcf2b43e0729a3cb543c76364c Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Tue, 21 Oct 2025 22:15:47 +0000
Subject: [PATCH] Optimize convert_node_to_data_point

The optimization introduces a **module-level cache** (`_SUBCLASS_CACHE`) that eliminates the expensive repeated traversal of the class hierarchy.

**Key Performance Problem**: The original code called `get_all_subclasses(cls)` on every invocation of `find_subclass_by_name`, and `get_all_subclasses` recursively walks the entire subclass tree. The line profiler shows this accounts for 78.8% of the execution time (1,155 hits calling `get_all_subclasses`).

**Optimization Strategy**:
- **Lazy caching**: Build a dictionary mapping class names to subclasses only once per base class
- **O(1) lookup**: Replace linear search through subclasses with dictionary lookup using `cache.get(name, None)`

**Performance Impact**:
- Original: 1,155 calls to iterate through subclasses (78.8% of time)
- Optimized: Only 1 call to `get_all_subclasses` to build the cache, then O(1) dictionary lookups

**Test Case Performance**: The optimization is most effective for scenarios with **unknown or invalid type names** (667-800% speedup), where the original code would traverse the entire subclass hierarchy before returning `None`. Valid type lookups also benefit significantly from the O(1) dictionary access pattern.

This caching approach scales particularly well when the same base class is used repeatedly, as subsequent calls avoid the expensive recursive traversal entirely.

**Caveat**: The cached mapping reflects only the subclasses that existed when it was built, so a name that is missing from it may belong to a subclass defined later rather than to an invalid type.
---
Review note: relative to the change described above, a lookup that misses
the cached mapping now rebuilds it once before returning None, so DataPoint
subclasses defined after the first lookup are still found, and the first
subclass with a given __name__ wins, as in the pre-patch scan. Known names
stay O(1); an unknown name costs one traversal per call, as before the
patch, so the unknown-name speedup quoted above no longer applies. The
post-image blob id on the "index" line is carried over from the original
submission and was not recomputed.

 .../graph/utils/convert_node_to_data_point.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cognee/modules/graph/utils/convert_node_to_data_point.py b/cognee/modules/graph/utils/convert_node_to_data_point.py
index fcae0cca13..3f7f667394 100644
--- a/cognee/modules/graph/utils/convert_node_to_data_point.py
+++ b/cognee/modules/graph/utils/convert_node_to_data_point.py
@@ -1,9 +1,10 @@
 from cognee.infrastructure.engine import DataPoint
 
+_SUBCLASS_CACHE = {}
+
 
 def convert_node_to_data_point(node_data: dict) -> DataPoint:
     subclass = find_subclass_by_name(DataPoint, node_data["type"])
-
     return subclass(**node_data)
 
 
@@ -17,8 +18,13 @@ def get_all_subclasses(cls):
 
 
 def find_subclass_by_name(cls, name):
-    for subclass in get_all_subclasses(cls):
-        if subclass.__name__ == name:
-            return subclass
-
-    return None
+    # Serve repeated lookups from a per-base-class {name: subclass} mapping.
+    cache = _SUBCLASS_CACHE.get(cls)
+    if cache is None or name not in cache:
+        # Rebuild on a miss so subclasses defined after the last build are
+        # still found; setdefault keeps the first match, like the old scan.
+        cache = {}
+        for subclass in get_all_subclasses(cls):
+            cache.setdefault(subclass.__name__, subclass)
+        _SUBCLASS_CACHE[cls] = cache
+    return cache.get(name)