Skip to content

Commit

Permalink
feat: Regex delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
simjak committed Jul 19, 2024
1 parent dfb2713 commit fe63881
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 29 deletions.
13 changes: 10 additions & 3 deletions semantic_chunkers/chunkers/regex.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import asyncio
from typing import List
from typing import List, Union

import regex

from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
Expand All @@ -8,9 +10,14 @@


class RegexChunker(BaseChunker):
def __init__(self, max_chunk_tokens: int = 300):
def __init__(
    self,
    max_chunk_tokens: int = 300,
    delimiters: Union[List[Union[str, regex.Pattern]], None] = None,
):
    """Initialize a regex-based chunker.

    Args:
        max_chunk_tokens: Maximum number of tokens allowed per chunk.
        delimiters: Optional ordered delimiters (plain strings or compiled
            regex patterns) applied before the built-in sentence regex.
            Defaults to None, meaning no extra delimiters.
    """
    super().__init__(name="regex_chunker", encoder=None, splitter=RegexSplitter())
    self.max_chunk_tokens = max_chunk_tokens
    # Copy into a fresh list: the previous `delimiters=[]` default was a
    # single shared object, and RegexSplitter.__call__ appends a compiled
    # pattern to the list it receives — so the shared default (and any
    # caller-supplied list) grew on every call. A per-instance copy makes
    # the attribute safe to pass through to the splitter.
    self.delimiters: List[Union[str, regex.Pattern]] = (
        list(delimiters) if delimiters else []
    )

def __call__(self, docs: list[str]) -> List[List[Chunk]]:
chunks = []
Expand All @@ -22,7 +29,7 @@ def __call__(self, docs: list[str]) -> List[List[Chunk]]:

for doc in docs:
regex_splitter = RegexSplitter()
sentences = regex_splitter(doc)
sentences = regex_splitter(doc, delimiters=self.delimiters)
for sentence in sentences:
sentence_token_count = text.tiktoken_length(sentence)

Expand Down
54 changes: 31 additions & 23 deletions semantic_chunkers/splitters/regex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Union

import regex

Expand All @@ -8,13 +8,6 @@
class RegexSplitter(BaseSplitter):
"""
Enhanced regex pattern to split a given text into sentences more accurately.
The enhanced regex pattern includes handling for:
- Direct speech and quotations.
- Abbreviations, initials, and acronyms.
- Decimal numbers and dates.
- Ellipses and other punctuation marks used in informal text.
- Removing control characters and format characters.
"""

regex_pattern = r"""
Expand Down Expand Up @@ -49,21 +42,36 @@ class RegexSplitter(BaseSplitter):
|
# Matches and removes control characters and format characters
[\p{Cc}\p{Cf}]+
# OR
|
# Splits after punctuation marks followed by another punctuation mark
(?<=[\.!?])(?=[\.!?])
# OR
|
# Splits after exclamation or question marks followed by whitespace or end of string
(?<=[!?])(?=\s|$)
"""

def __call__(self, doc: str) -> List[str]:
# Step 1: Split by \n\n
chunks = doc.split("\n\n")
sentences = []
for chunk in chunks:
# Step 2: Split by \n within each chunk
sub_chunks = chunk.split("\n")
for sub_chunk in sub_chunks:
# Step 3: Split by regex pattern within each sub_chunk
sub_sentences = regex.split(
self.regex_pattern, sub_chunk, flags=regex.VERBOSE
)
for sentence in sub_sentences:
if sentence.strip():
sentences.append(sentence.strip())
def __call__(
    self, doc: str, delimiters: Union[List[Union[str, regex.Pattern]], None] = None
) -> List[str]:
    """Split *doc* into sentence fragments.

    Each delimiter is applied in order, and the class's sentence regex
    (``regex_pattern``) is always applied last. A plain-string delimiter
    is re-attached to the end of every fragment it terminated; a regex
    delimiter contributes no trailing text. Fragments are stripped, and
    empty fragments are dropped.

    Args:
        doc: The text to split.
        delimiters: Optional ordered list of plain strings or compiled
            regex patterns to split on before the sentence regex.
            Defaults to None (sentence regex only).

    Returns:
        The non-empty, stripped fragments in document order.
    """
    # Work on a copy. The previous signature used a mutable default
    # (`delimiters=[]`) and then appended the compiled sentence regex to
    # it — so the shared default list gained one compiled pattern per
    # call, and caller-supplied lists (e.g. RegexChunker.delimiters) were
    # mutated and grew on every document.
    active_delimiters: List[Union[str, regex.Pattern]] = (
        list(delimiters) if delimiters else []
    )
    # Ensure the regex pattern is applied last.
    active_delimiters.append(regex.compile(self.regex_pattern, flags=regex.VERBOSE))

    sentences = [doc]
    for delimiter in active_delimiters:
        next_sentences: List[str] = []
        # Hoist the type check: a delimiter's kind is loop-invariant.
        is_pattern = isinstance(delimiter, regex.Pattern)
        for sentence in sentences:
            if is_pattern:
                pieces = delimiter.split(sentence)
                split_char = ""  # regex boundaries contribute no trailing char
            else:
                pieces = sentence.split(delimiter)
                split_char = delimiter
            for i, piece in enumerate(pieces):
                if i < len(pieces) - 1:
                    # Re-attach the delimiter consumed by str.split.
                    piece += split_char
                if piece.strip():
                    next_sentences.append(piece.strip())
        sentences = next_sentences
    return sentences
18 changes: 15 additions & 3 deletions tests/unit/test_regex_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,19 @@ def setUp(self):
def test_split_by_double_newline(self):
    """A "\\n\\n" delimiter yields one fragment per paragraph."""
    text = "This is the first paragraph.\n\nThis is the second paragraph."
    self.assertEqual(
        self.splitter(text, delimiters=["\n\n"]),
        ["This is the first paragraph.", "This is the second paragraph."],
    )

def test_split_by_single_newline(self):
    """A "\\n" delimiter yields one fragment per line."""
    text = "This is the first line.\nThis is the second line."
    self.assertEqual(
        self.splitter(text, delimiters=["\n"]),
        ["This is the first line.", "This is the second line."],
    )

def test_split_by_period(self):
    """A "." delimiter splits sentences and keeps the period attached."""
    text = "This is the first sentence. This is the second sentence."
    self.assertEqual(
        self.splitter(text, delimiters=["."]),
        ["This is the first sentence.", "This is the second sentence."],
    )

def test_complex_split(self):
Expand All @@ -35,6 +35,18 @@ def test_complex_split(self):
"Fourth line.",
"Fifth paragraph.",
]
result = self.splitter(doc, delimiters=["\n\n", "\n", "."])
self.assertEqual(result, expected)

def test_custom_delimiters(self):
    """A custom "|" delimiter is preserved on every part but the last."""
    parts = self.splitter("First part|Second part|Third part", delimiters=["|"])
    self.assertEqual(parts, ["First part|", "Second part|", "Third part"])

def test_regex_split(self):
    """With no delimiters, the built-in sentence regex alone does the split."""
    sentences = self.splitter("This is a sentence. And another one! Yet another?")
    self.assertEqual(
        sentences,
        ["This is a sentence.", "And another one!", "Yet another?"],
    )

Expand Down

0 comments on commit fe63881

Please sign in to comment.