Commit dd6792a
Refactor the model to make it usable without the server
Currently, the `SemanticMatch` and `EquivalenceTable` classes can only be used effectively together with the server. This commit introduces a new and improved `SemanticMatchDictStore`, which implements the algorithm for searching matches, and refactors the service to use this new class. Furthermore, it cleans up the service to be used in a more pythonic way, eliminating the need for the `service_model` module.
1 parent ce50050 commit dd6792a
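
In practice, the refactor enables usage like the following sketch, assuming the classes from the diff below are imported from `semantic_matcher.model` (the semantic IDs and scores are illustrative):

    from semantic_matcher.model import SemanticMatch, SemanticMatchDictStore

    store = SemanticMatchDictStore([
        SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.4),
        SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.5),
    ])
    # Resolve direct and transitive matches of "A" whose multiplied score is >= 0.1:
    for match in store.get_matches("A", min_score=0.1):
        print(match.match_semantic_id, match.score)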

File tree

7 files changed: +603, -55 lines

semantic_matcher/algorithm.py

Lines changed: 129 additions & 0 deletions

@@ -0,0 +1,129 @@
from typing import List, Tuple
import heapq

import networkx as nx


class SemanticMatchGraph(nx.DiGraph):
    def __init__(self):
        super().__init__()

    def add_semantic_match(self,
                           base_semantic_id: str,
                           match_semantic_id: str,
                           score: float):
        self.add_edge(
            u_of_edge=base_semantic_id,
            v_of_edge=match_semantic_id,
            weight=score,
        )


class MatchResult:
    base_semantic_id: str
    match_semantic_id: str
    score: float
    path: List[str]  # The path of `semantic_id`s that the algorithm took

    def __init__(self,
                 base_semantic_id: str,
                 match_semantic_id: str,
                 score: float,
                 path: List[str]):
        self.base_semantic_id = base_semantic_id
        self.match_semantic_id = match_semantic_id
        self.score = score
        self.path = path

    def __repr__(self) -> str:
        return f"{' -> '.join(self.path + [self.match_semantic_id])} = {self.score}"


def find_semantic_matches(
        graph: SemanticMatchGraph,
        semantic_id: str,
        min_score: float = 0.5
) -> List[MatchResult]:
    """
    Find semantic matches for a given node with a minimum score threshold.

    Args:
        graph (nx.DiGraph): The directed graph with weighted edges.
        semantic_id (str): The starting semantic_id.
        min_score (float): The minimum similarity score to consider.
            This value is necessary to ensure that the search terminates, even on sufficiently large graphs.

    Returns:
        List[MatchResult]:
            A list of MatchResults, sorted by their score with the highest score first.
    """
    if semantic_id not in graph:
        return []

    # We need to make sure that all possible paths starting from the given semantic_id are explored.
    # To achieve this, we use the concept of a "priority queue". While we could use a simple FIFO list of matches
    # to explore, a priority queue ensures that elements with the highest priority are processed first, regardless
    # of when they were added. This way, we end up with an already sorted result, with the highest match at the
    # beginning of the list. As the implementation of this abstract data structure, we choose a "max-heap".
    # However, Python's standard library only provides an efficient "min-heap" (`heapq`), so we use that and
    # negate the score values.
    # We initialize the priority queue:
    pq: List[Tuple[float, str, List[str]]] = [(-1.0, semantic_id, [])]  # (neg_score, node, path)
    # The queue entries are structured as follows:
    # - `neg_score`: The negated score of the match
    # - `node`: The `match_semantic_id` of the match
    # - `path`: The path between the `semantic_id` and the `match_semantic_id`

    # Prepare the result list
    results: List[MatchResult] = []

    # Run the priority queue until all possible paths have been explored.
    # This means in each iteration:
    # - We pop the top element of the queue, as it's the next highest semantic match we want to explore
    # - If the match has a score higher than or equal to the given `min_score`, we add it to the results
    # - We add all connected `semantic_id`s to the priority queue to be treated next
    # - We continue with the next element of the queue
    while pq:
        # Get the highest-score match from the queue
        neg_score, node, path = heapq.heappop(pq)
        score = -neg_score  # Convert back to positive

        # Store the result if it is above the threshold (except for the start node)
        if node != semantic_id and score >= min_score:
            results.append(MatchResult(
                base_semantic_id=semantic_id,
                match_semantic_id=node,
                score=score,
                path=path
            ))

        # Traverse to the neighboring and therefore connected `semantic_id`s
        for neighbor, edge_data in graph[node].items():
            new_score: float = score * edge_data["weight"]  # Multiplicative propagation

            # Prevent loops by ensuring we do not revisit the start node after the first iteration
            if neighbor == semantic_id:
                continue  # Avoid re-exploring the start node

            # We add the newly found `semantic_id`s to the queue, to be explored next in order of their score
            if new_score >= min_score:
                heapq.heappush(pq, (-new_score, neighbor, path + [node]))  # Push updated path

    return results


if __name__ == "__main__":
    # Create graph
    G = SemanticMatchGraph()
    G.add_edge("A", "B", weight=0.8)
    G.add_edge("B", "C", weight=0.7)
    G.add_edge("C", "D", weight=0.9)
    G.add_edge("B", "D", weight=0.6)

    # Find matches for "A"
    matches: List[MatchResult] = find_semantic_matches(G, "A", min_score=0)

    # Print results
    for match in matches:
        print(match)
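
For reference, running the module directly walks every path from "A" in the demo graph; the output is (scores are printed as raw floats, so binary rounding noise will appear, e.g. 0.56 prints as 0.5599999999999999):

    A -> B = 0.8
    A -> B -> C = 0.56
    A -> B -> C -> D = 0.504
    A -> B -> D = 0.48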

semantic_matcher/model.py

Lines changed: 155 additions & 4 deletions

@@ -1,20 +1,171 @@
-from typing import Dict, List
+import json
+import copy
+from typing import Dict, List, Set, Optional, Iterable
 
 from pydantic import BaseModel
 
 
+# Todo: Adapt to use the new algorithm
+
+
 class SemanticMatch(BaseModel):
     """
     A semantic match, mapping two semanticIDs with a matching score. Can be imagined as a weighted graph with
     `base_semantic_id` ---`score`---> `match_semantic_id`
 
-    Todo: Think about static and TTL, but that is optimization
-    Todo: Maybe we want to have the matching method as debug information
+    :cvar base_semantic_id:
+    :cvar match_semantic_id:
+    :cvar score: The semantic similarity score, a float between 0 and 1
+    :cvar path: Optionally, if the `SemanticMatch` did not come from a source but was inferred from other
+        `SemanticMatch`es, `path` stores the `SemanticMatch`es it was inferred from
+    :cvar meta_information: Optional meta information, such as the source of the `SemanticMatch`
     """
     base_semantic_id: str
     match_semantic_id: str
     score: float
-    meta_information: Dict
+    path: Optional[List["SemanticMatch"]] = None
+    meta_information: Optional[Dict] = None
+
+    def __hash__(self):
+        # `path` is a list and `meta_information` may be `None`, so both are converted into
+        # hashable representations first:
+        return hash((
+            self.base_semantic_id,
+            self.match_semantic_id,
+            self.score,
+            tuple(self.path) if self.path is not None else None,
+            frozenset(self.meta_information.items()) if self.meta_information is not None else None,
+        ))
+
+    @classmethod
+    def combine_semantic_matches(cls, first: "SemanticMatch", second: "SemanticMatch") -> "SemanticMatch":
+        """
+        Construct a new `SemanticMatch` by combining two `SemanticMatch`es.
+
+        Given the following situation:
+            A --0.4--> B
+            B --0.5--> C
+        this constructs a new `SemanticMatch`:
+            A --(0.4*0.5)--> C
+        while updating the `path` information of the new `SemanticMatch`.
+
+        :param first: First `SemanticMatch`
+        :param second: Second `SemanticMatch`. Note that `second.base_semantic_id` needs to be the same
+            as `first.match_semantic_id`
+        :return: The combined `SemanticMatch`
+        """
+        if not first.match_semantic_id == second.base_semantic_id:
+            raise KeyError(f"Cannot combine. `first.match_semantic_id` ({first.match_semantic_id}) does not "
+                           f"fit `second.base_semantic_id` ({second.base_semantic_id}).")
+        if second.path:
+            new_path = copy.copy(second.path)
+            new_path.insert(0, second)
+        else:
+            new_path = [second]
+        return SemanticMatch(
+            base_semantic_id=first.base_semantic_id,
+            match_semantic_id=second.match_semantic_id,
+            score=first.score*second.score,
+            path=new_path,
+        )
+
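A worked example of the combination rule described in the docstring above (the IDs and scores are illustrative):

    a_b = SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.4)
    b_c = SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.5)
    a_c = SemanticMatch.combine_semantic_matches(a_b, b_c)
    assert a_c.base_semantic_id == "A" and a_c.match_semantic_id == "C"
    assert a_c.score == 0.4 * 0.5
    assert a_c.path == [b_c]  # A->C was inferred via the match B->C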
+
+class SemanticMatchDictStore:
+    """
+    A collection of `SemanticMatch`es, stored in a Dict, where the key is the `base_semantic_id` and the value is
+    the set of `SemanticMatch`es with that `base_semantic_id`. This allows for efficient resolution of the
+    `SemanticMatch`es of a `base_semantic_id`.
+    """
+    def __init__(self, matches: Iterable[SemanticMatch]):
+        self._store: Dict[str, Set[SemanticMatch]] = {}
+        for x in matches:
+            self.add(x)
+
+    def add(self, match: SemanticMatch) -> None:
+        """
+        Add a `SemanticMatch` to the store
+        """
+        if match.base_semantic_id in self._store:
+            self._store[match.base_semantic_id].add(match)
+        else:
+            self._store[match.base_semantic_id] = {match}
+
+    def discard(self, match: SemanticMatch) -> None:
+        """
+        Discard a `SemanticMatch` from the store
+        """
+        # First, we remove the `SemanticMatch` from the set of matches for that `base_semantic_id`
+        self._store[match.base_semantic_id].discard(match)
+        # Then, if there are no `SemanticMatch`es left for that `base_semantic_id`, we remove the Dict entry
+        if not len(self._store[match.base_semantic_id]):
+            self._store.pop(match.base_semantic_id)
+
+    def get_all_matches(self) -> Set[SemanticMatch]:
+        """
+        Return a set of all `SemanticMatch`es currently inside the store
+        """
+        all_matches: Set[SemanticMatch] = set()
+        for i in self._store.values():
+            all_matches.update(i)
+        return all_matches
+
+    def get_matches(self, semantic_id: str, min_score: Optional[float] = None) -> Set[SemanticMatch]:
+        """
+        Return all `SemanticMatch`es of a given `semantic_id` currently inside the store that have a score
+        higher than or equal to `min_score`.
+        This is a recursive function that also queries the matches of the matches, as long as the multiplicative
+        score of the matches is still higher than or equal to `min_score`.
+        """
+        matches: Set[SemanticMatch] = set()  # This is our return Set
+
+        # First, we check on the current level
+        current_matches_with_any_score = self._store.get(semantic_id, set())
+        current_matches = {
+            match for match in current_matches_with_any_score if min_score is None or match.score >= min_score
+        }
+        # We can already update our return Set, since we know that the `current_matches` will definitely be inside
+        matches.update(current_matches)
+
+        # Now we do the same query for each of the current_matches that have a score higher than or equal
+        # to min_score
+        # Todo: We currently have a loop in here that we need to break
+        for match in current_matches:
+            # We calculate the new minimal score.
+            # The unified score is multiplicative: score(A->B) * score(B->C)
+            # This score should be higher than or equal to the requested min_score:
+            #     score(A->B) * score(B->C) >= min_score
+            # score(A->B) is well known, as it is the `match.score`
+            #     => score(B->C) >= (min_score/score(A->B))
+            if min_score is not None:
+                new_min_score = min_score/match.score
+            else:
+                new_min_score = min_score
+            # Here's the recursive function call; we do the same thing again with the matches of the matched
+            # `semantic_id` and the updated `min_score`:
+            new_matches = self.get_matches(semantic_id=match.match_semantic_id, min_score=new_min_score)
+            # These new matches are not relative to the original `base_semantic_id`, so we need to create new
+            # `SemanticMatch`es, which also store the path.
+            for new_match in new_matches:
+                matches.add(SemanticMatch.combine_semantic_matches(
+                    first=match,
+                    second=new_match
+                ))
+
+        # In the end, we return our return Set
+        return matches
+
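To make the threshold propagation concrete, a small sketch continuing the example above (IDs and scores illustrative; it assumes the recursion follows `match.match_semantic_id` as written here):

    store = SemanticMatchDictStore([a_b, b_c])  # A --0.4--> B, B --0.5--> C
    # B->C is followed because 0.5 >= 0.1/0.4, so the inferred A->C (score 0.2) is included:
    assert {m.match_semantic_id for m in store.get_matches("A", min_score=0.1)} == {"B", "C"}
    # With min_score=0.3, a second hop would need a score of at least 0.3/0.4 (~0.75), so only A->B remains:
    assert {m.match_semantic_id for m in store.get_matches("A", min_score=0.3)} == {"B"}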
+    def to_file(self, filename: str) -> None:
+        matches: List[Dict] = [match.model_dump() for match in self.get_all_matches()]
+        with open(filename, "w") as file:
+            json.dump(matches, file, indent=4)
+
+    @classmethod
+    def from_file(cls, filename: str) -> "SemanticMatchDictStore":
+        with open(filename, "r") as file:
+            matches_data = json.load(file)
+        matches = [SemanticMatch(**match_dict) for match_dict in matches_data]
+        return cls(matches)
+
+    def __len__(self) -> int:
+        length = 0
+        for i in self._store.values():
+            length += len(i)
+        return length
 
 
 class EquivalenceTable(BaseModel):
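
Since `to_file` dumps the matches via `model_dump` and `from_file` re-validates them through the `SemanticMatch` constructor, a JSON round trip should preserve the store. A sketch continuing the example above (the filename is illustrative):

    store.to_file("matches.json")
    restored = SemanticMatchDictStore.from_file("matches.json")
    assert len(restored) == len(store)
    assert restored.get_all_matches() == store.get_all_matches()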
