Add 'Longest Common Subsequence'

nickolashkraus · nickolashkraus · commit a9b2d4dc9208 · 2025-01-16T10:59:35.000-06:00
diff --git a/README.md b/README.md
@@ -22,6 +22,8 @@ A collection of LeetCode solutions
 
 [Linked List Cycle](./src/linked_list_cycle.py)
 
+[Longest Common Subsequence](./src/longest_common_subsequence.py)
+
 [Maximum Depth of Binary Tree](./src/maximum_depth_of_binary_tree.py)
 
 [Maximum Subarray](./src/maximum_subarray.py)
diff --git a/src/longest_common_subsequence.py b/src/longest_common_subsequence.py
@@ -6,43 +6,42 @@
 NOTES
   * Use dynamic programming (2D) or recursion.
 
-  * A common subsequence is a sequence of letters that appears in both strings.
-    Not every letter in the strings has to be used, but letters cannot be
-    rearranged. In essence, a subsequence of a string 's' is a string we get by
-    deleting some letters in 's'.
-
-  * The most obvious approach would be to iterate through each subsequence of
-    the first string and check whether or not it is also a subsequence of the
-    second string. This, however, will require exponential time to run. The
-    number of subsequences in a string is up to 2^L, where L is the length of
-    the string.
-
-  * There are a couple of strategies we use to design a tractable
-    (non-exponential) algorithm for an optimization problem:
-
-      1. Identifying a greedy algorithm
-      2. Dynamic programming
-
-  * There is no guarantee that either is possible. Additionally, greedy
-    algorithms are strictly less common than dynamic programming algorithms and
-    are often more difficult to identify. However, if a greedy algorithm
-    exists, then it will almost always be better than a dynamic programming
-    one. You should, therefore, at least give some thought to the potential
-    existence of a greedy algorithm before jumping straight into dynamic
-    programming.
-
-  * Recall that there are two different techniques we can use to implement a
-    dynamic programming solution; memoization and tabulation.
-
-      * Memoization is where we add caching to a function (that has no side
-        effects). In dynamic programming, it is typically used on recursive
-        functions for a top-down solution that starts with the initial problem
-        and then recursively calls itself to solve smaller problems.
-
-      * Tabulation uses a table to keep track of subproblem results and works
-        in a bottom-up manner: solving the smallest subproblems before the
-        large ones, in an iterative manner. Often, people use the words
-        "tabulation" and "dynamic programming" interchangeably.
+A common subsequence is a sequence of letters that appears in both strings. Not
+every letter in the string has to be used, but letters cannot be rearranged. In
+essence, a subsequence of a string 's' is a string we get by deleting some
+letters in 's'.
+
+The most obvious approach would be to iterate through each subsequence of the
+first string and check whether or not it is also a subsequence of the second
+string. This, however, will require exponential time to run. The number of
+subsequences in a string is up to 2^L, where L is the length of the string.
+
+There are a couple of strategies we can use to design a tractable
+(non-exponential) algorithm for an optimization problem:
+
+  1. Identifying a greedy algorithm
+  2. Dynamic programming
+
+There is no guarantee that either is possible. Additionally, greedy algorithms
+are strictly less common than dynamic programming algorithms and are often more
+difficult to identify. However, if a greedy algorithm exists, then it will
+almost always be better than a dynamic programming one. You should, therefore,
+at least give some thought to the potential existence of a greedy algorithm
+before jumping straight into dynamic programming.
+
+Recall that there are two different techniques we can use to implement a
+dynamic programming solution: tabulation and memoization.
+
+  * Tabulation uses a table to keep track of subproblem results and works in a
+    bottom-up manner: solving the smallest subproblems before the large ones,
+    in an iterative manner. Often, people use the words "tabulation" and
+    "dynamic programming" interchangeably.
+
+  * Memoization is where we add caching to a function (that has no side
+    effects). In dynamic programming, it is typically used on recursive
+    functions for a top-down solution that starts with the initial problem and
+    then recursively calls itself to solve smaller problems. Memoization is
+    useful when a problem has overlapping subproblems.
 """
 
 
@@ -60,44 +59,98 @@ class Solution:
     subproblems, the smaller ones that they depend on will already have been
     solved. The best way to do this is to use a 2D array.
 
-    Remembering back to the memoization solution, there were two cases.
+    There are two cases when considering the optimal solution of the
+    subproblem:
 
       1. The first letter of both strings are the same.
-      2. The first letter of both strings are *not* the same.
+      2. The first letter of both strings are not the same.
     """
 
     def longestCommonSubsequence(self, text1: str, text2: str) -> int:
+        """
+        Given two strings, `text1` and `text2`, compute the length of their
+        Longest Common Subsequence (LCS).
+        """
+        # Create an m×n matrix initialized to 0s, where m is the number of rows
+        # (|text1| + 1) and n is the number of columns (|text2| + 1).
+        #
+        # dp[0..m-1][0] and dp[0][0..n-1] stay at 0. This represents our base
+        # case:
+        #
+        #   If either sequence is empty, the LCS length is 0.
+        m, n = len(text1) + 1, len(text2) + 1
+        dp: list[list[int]] = [[0 for j in range(n)] for i in range(m)]
+
+        # Fill in the rest of the matrix, where dp[i][j] is the LCS length of
+        # text1[:i] and text2[:j]. For each position dp[i][j], we calculate:
+        #
+        #   If text1[i-1] == text2[j-1], dp[i][j] = 1 + dp[i-1][j-1]
+        #
+        # This means we include the current matching character and add 1 to the
+        # previous LCS length.
+        #
+        #   Else, dp[i][j] = max(dp[i-1][j], dp[i][j-1])
+        #
+        # This means we take the maximum LCS length when excluding either the
+        # current character from sequence `text1` or sequence `text2`.
+        for i in range(1, m):
+            for j in range(1, n):
+                if text1[i - 1] == text2[j - 1]:
+                    dp[i][j] = 1 + dp[i - 1][j - 1]
+                else:
+                    dp[i][j] = max(
+                        dp[i - 1][j],  # Exclude the character at position i in `text1`
+                        dp[i][j - 1],  # Exclude the character at position j in `text2`
+                    )
+        # NOTE: dp[m - 1][n - 1] is the LCS length of the two full strings.
+        return dp[m - 1][n - 1]
+
+
+class AlternativeSolution:
+    """
+    Typically, the length of the Longest Common Subsequence (LCS) is found in
+    the last cell of the table. However, we can also solve the problem in
+    reverse, which places the solution at dp[0][0]. Though slightly less
+    intuitive, this allows us to use the same indices for the strings and the
+    matrix.
+    """
+
+    def longestCommonSubsequence(self, text1: str, text2: str) -> int:
+        """
+        Given two strings, `text1` and `text2`, compute the length of their
+        Longest Common Subsequence (LCS) using a reverse iteration approach.
+        """
         # Initializing the table to 0 allows us to calculate the current
         # subproblem from previous subproblems.
         #
-        #     a b c d e -    i →
-        #   a 0 0 0 0 0 0  j
-        #   c 0 0 0 0 0 0  ↓
-        #   e 0 0 0 0 0 0
-        #   - 0 0 0 0 0 0
-        #
-        #     a b c d e -    i →
-        #   a 3 2 2 1 1 0  j
-        #   c 2 2 2 1 1 0  ↓
-        #   e 1 1 1 1 1 0
-        #   - 0 0 0 0 0 0
+        #     a c e -    j →
+        #   a 0 0 0 0  i
+        #   b 0 0 0 0  ↓
+        #   c 0 0 0 0
+        #   d 0 0 0 0
+        #   e 0 0 0 0
+        #   - 0 0 0 0
         #
-        # where a,a is (0,0) and e,e is (5,3) (for i,j).
-        col, row = len(text1) + 1, len(text2) + 1
-        dp: list[list[int]] = [[0 for _ in range(col)] for _ in range(row)]
+        #     a c e -    j →
+        #   a 3 2 1 0  i
+        #   b 2 2 1 0  ↓
+        #   c 2 2 1 0
+        #   d 1 1 1 0
+        #   e 1 1 1 0
+        #   - 0 0 0 0
+        m, n = len(text1) + 1, len(text2) + 1
+        dp: list[list[int]] = [[0 for j in range(n)] for i in range(m)]
 
         # Iterate over the table in reverse (first by column, then by row).
-        for i in reversed(range(len(text2))):
-            for j in reversed(range(len(text1))):
+        for i in reversed(range(len(text1))):
+            for j in reversed(range(len(text2))):
                 # 1. The first letter of both strings are the same.
-                if text1[j] == text2[i]:
+                if text1[i] == text2[j]:
                     dp[i][j] = 1 + dp[i + 1][j + 1]
                 # 2. The first letter of both strings are *not* the same.
                 else:
                     dp[i][j] = max(dp[i][j + 1], dp[i + 1][j])
-        # NOTE: Uncomment to print the result of the table.
-        # for r in dp:
-        #     print(r)
+
         return dp[0][0]
 
 
@@ -110,30 +163,34 @@ class MemoizationSolution:
     """
 
     def longestCommonSubsequence(self, text1: str, text2: str) -> int:
+        """
+        Given two strings, `text1` and `text2`, compute the length of their
+        Longest Common Subsequence (LCS) using memoization.
+        """
         # Initializing the memoization table to -1 allows us to determine
         # whether or not the value has been calculated.
         #
-        #     a b c d e    i →
-        #   a . . . . .  j
-        #   c . . . . .  ↓
-        #   e . . . . .
-        #
-        # where a,a is (0,0) and e,e is (5,3) (for i,j).
-        col, row = len(text1), len(text2)
-        memo: list[list[int]] = [[-1 for _ in range(col)] for _ in range(row)]
+        #     a c e    j →
+        #   a . . .  i
+        #   b . . .  ↓
+        #   c . . .
+        #   d . . .
+        #   e . . .
+        m, n = len(text1), len(text2)
+        memo: list[list[int]] = [[-1 for j in range(n)] for i in range(m)]
 
         def lcs(s1: str, s2: str, memo: list[list[int]]) -> int:
-            col, row = len(memo[0]), len(memo)
-            if s1 == "" or s2 == "":
+            if not s1 or not s2:
                 return 0
+            # Map the remaining suffixes to their position in the memo table.
+            i, j = len(memo) - len(s1), len(memo[0]) - len(s2)
             # Check whether we've already solved the given subproblem.
-            i, j = row - len(s2), col - len(s1)
             if memo[i][j] != -1:
                 return memo[i][j]
             if s1[0] == s2[0]:
                 memo[i][j] = 1 + lcs(s1[1:], s2[1:], memo)
             else:
-                memo[i][j] = max(lcs(s1[0:], s2[1:], memo), lcs(s1[1:], s2[0:], memo))
+                memo[i][j] = max(lcs(s1[1:], s2, memo), lcs(s1, s2[1:], memo))
             return memo[i][j]
 
         return lcs(text1, text2, memo)
@@ -167,18 +224,23 @@ class RecursiveSolution:
 
       Finally, we formalize the above cases in code.
 
-    This solution is O(M x N), where where `M` is the length of the first
-    string and `N` is the length of the second string.
+    This solution is O(2^(M + N)) in the worst case, where `M` is the length
+    of the first string and `N` is the length of the second string.
 
-    NOTE: This solution exceeds the time limit.
+    NOTE: Though *technically* correct, this solution exceeds the time limit,
+    since it recomputes overlapping subproblems instead of caching them.
     """
 
     def longestCommonSubsequence(self, text1: str, text2: str) -> int:
+        """
+        Given two strings, `text1` and `text2`, compute the length of their
+        Longest Common Subsequence (LCS) using pure recursion.
+        """
         def lcs(s1: str, s2: str) -> int:
-            if s1 == "" or s2 == "":
+            if not s1 or not s2:
                 return 0
             if s1[0] == s2[0]:
                 return 1 + lcs(s1[1:], s2[1:])
-            return max(lcs(s1[0:], s2[1:]), lcs(s1[1:], s2[0:]))
+            return max(lcs(s1[1:], s2), lcs(s1, s2[1:]))
 
         return lcs(text1, text2)
diff --git a/tests/test_longest_common_subsequence.py b/tests/test_longest_common_subsequence.py
@@ -6,11 +6,7 @@
 
 from unittest import TestCase
 
-from src.longest_common_subsequence import (
-    MemoizationSolution,
-    RecursiveSolution,
-    Solution,
-)
+from src.longest_common_subsequence import AlternativeSolution, MemoizationSolution, RecursiveSolution, Solution
 
 
 class TestSolution(TestCase):
@@ -31,6 +27,24 @@ def test_4(self):
         assert Solution().longestCommonSubsequence("pmjghexybyrgzczy", "hafcdqbgncrcbihkd") == exp
 
 
+class TestAlternativeSolution(TestCase):
+    def test_1(self):
+        exp = 3
+        assert AlternativeSolution().longestCommonSubsequence("abcde", "ace") == exp
+
+    def test_2(self):
+        exp = 3
+        assert AlternativeSolution().longestCommonSubsequence("abc", "abc") == exp
+
+    def test_3(self):
+        exp = 0
+        assert AlternativeSolution().longestCommonSubsequence("abc", "def") == exp
+
+    def test_4(self):
+        exp = 4
+        assert AlternativeSolution().longestCommonSubsequence("pmjghexybyrgzczy", "hafcdqbgncrcbihkd") == exp
+
+
 class TestMemoizationSolution(TestCase):
     def test_1(self):
         exp = 3