Skip to content

Commit

Permalink
feat: Regex delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
simjak committed Jul 19, 2024
1 parent dfb2713 commit fe63881
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 29 deletions.
13 changes: 10 additions & 3 deletions semantic_chunkers/chunkers/regex.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import asyncio
from typing import List
from typing import List, Union

import regex

from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
Expand All @@ -8,9 +10,14 @@


class RegexChunker(BaseChunker):
def __init__(self, max_chunk_tokens: int = 300):
def __init__(
    self,
    max_chunk_tokens: int = 300,
    delimiters: Union[List[Union[str, regex.Pattern]], None] = None,
):
    """Initialize a regex-based chunker.

    Args:
        max_chunk_tokens: Maximum number of tokens allowed per chunk.
        delimiters: Optional ordered delimiters (plain strings or compiled
            regex patterns) applied before the built-in sentence regex.
            Defaults to None, meaning no extra delimiters.
    """
    super().__init__(name="regex_chunker", encoder=None, splitter=RegexSplitter())
    self.max_chunk_tokens = max_chunk_tokens
    # Copy into a fresh list: the previous `delimiters=[]` default was a
    # single shared object, and RegexSplitter.__call__ appends a compiled
    # pattern to the list it receives — so the shared default (and any
    # caller-supplied list) grew on every call. A per-instance copy makes
    # the attribute safe to pass through to the splitter.
    self.delimiters: List[Union[str, regex.Pattern]] = (
        list(delimiters) if delimiters else []
    )

def __call__(self, docs: list[str]) -> List[List[Chunk]]:
chunks = []
Expand All @@ -22,7 +29,7 @@ def __call__(self, docs: list[str]) -> List[List[Chunk]]:

for doc in docs:
regex_splitter = RegexSplitter()
sentences = regex_splitter(doc)
sentences = regex_splitter(doc, delimiters=self.delimiters)
for sentence in sentences:
sentence_token_count = text.tiktoken_length(sentence)

Expand Down
54 changes: 31 additions & 23 deletions semantic_chunkers/splitters/regex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Union

import regex

Expand All @@ -8,13 +8,6 @@
class RegexSplitter(BaseSplitter):
"""
Enhanced regex pattern to split a given text into sentences more accurately.
The enhanced regex pattern includes handling for:
- Direct speech and quotations.
- Abbreviations, initials, and acronyms.
- Decimal numbers and dates.
- Ellipses and other punctuation marks used in informal text.
- Removing control characters and format characters.
"""

regex_pattern = r"""
Expand Down Expand Up @@ -49,21 +42,36 @@ class RegexSplitter(BaseSplitter):
|
# Matches and removes control characters and format characters
[\p{Cc}\p{Cf}]+
# OR
|
# Splits after punctuation marks followed by another punctuation mark
(?<=[\.!?])(?=[\.!?])
# OR
|
# Splits after exclamation or question marks followed by whitespace or end of string
(?<=[!?])(?=\s|$)
"""

def __call__(self, doc: str) -> List[str]:
# Step 1: Split by \n\n
chunks = doc.split("\n\n")
sentences = []
for chunk in chunks:
# Step 2: Split by \n within each chunk
sub_chunks = chunk.split("\n")
for sub_chunk in sub_chunks:
# Step 3: Split by regex pattern within each sub_chunk
sub_sentences = regex.split(
self.regex_pattern, sub_chunk, flags=regex.VERBOSE
)
for sentence in sub_sentences:
if sentence.strip():
sentences.append(sentence.strip())
def __call__(
    self, doc: str, delimiters: Union[List[Union[str, regex.Pattern]], None] = None
) -> List[str]:
    """Split *doc* into sentence fragments.

    Each delimiter is applied in order, and the class's sentence regex
    (``regex_pattern``) is always applied last. A plain-string delimiter
    is re-attached to the end of every fragment it terminated; a regex
    delimiter contributes no trailing text. Fragments are stripped, and
    empty fragments are dropped.

    Args:
        doc: The text to split.
        delimiters: Optional ordered list of plain strings or compiled
            regex patterns to split on before the sentence regex.
            Defaults to None (sentence regex only).

    Returns:
        The non-empty, stripped fragments in document order.
    """
    # Work on a copy. The previous signature used a mutable default
    # (`delimiters=[]`) and then appended the compiled sentence regex to
    # it — so the shared default list gained one compiled pattern per
    # call, and caller-supplied lists (e.g. RegexChunker.delimiters) were
    # mutated and grew on every document.
    active_delimiters: List[Union[str, regex.Pattern]] = (
        list(delimiters) if delimiters else []
    )
    # Ensure the regex pattern is applied last.
    active_delimiters.append(regex.compile(self.regex_pattern, flags=regex.VERBOSE))

    sentences = [doc]
    for delimiter in active_delimiters:
        next_sentences: List[str] = []
        # Hoist the type check: a delimiter's kind is loop-invariant.
        is_pattern = isinstance(delimiter, regex.Pattern)
        for sentence in sentences:
            if is_pattern:
                pieces = delimiter.split(sentence)
                split_char = ""  # regex boundaries contribute no trailing char
            else:
                pieces = sentence.split(delimiter)
                split_char = delimiter
            for i, piece in enumerate(pieces):
                if i < len(pieces) - 1:
                    # Re-attach the delimiter consumed by str.split.
                    piece += split_char
                if piece.strip():
                    next_sentences.append(piece.strip())
        sentences = next_sentences
    return sentences
18 changes: 15 additions & 3 deletions tests/unit/test_regex_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,19 @@ def setUp(self):
def test_split_by_double_newline(self):
    """A "\\n\\n" delimiter yields one fragment per paragraph."""
    text = "This is the first paragraph.\n\nThis is the second paragraph."
    self.assertEqual(
        self.splitter(text, delimiters=["\n\n"]),
        ["This is the first paragraph.", "This is the second paragraph."],
    )

def test_split_by_single_newline(self):
    """A "\\n" delimiter yields one fragment per line."""
    text = "This is the first line.\nThis is the second line."
    self.assertEqual(
        self.splitter(text, delimiters=["\n"]),
        ["This is the first line.", "This is the second line."],
    )

def test_split_by_period(self):
    """A "." delimiter splits sentences and keeps the period attached."""
    text = "This is the first sentence. This is the second sentence."
    self.assertEqual(
        self.splitter(text, delimiters=["."]),
        ["This is the first sentence.", "This is the second sentence."],
    )

def test_complex_split(self):
Expand All @@ -35,6 +35,18 @@ def test_complex_split(self):
"Fourth line.",
"Fifth paragraph.",
]
result = self.splitter(doc, delimiters=["\n\n", "\n", "."])
self.assertEqual(result, expected)

def test_custom_delimiters(self):
    """A custom "|" delimiter is preserved on every part but the last."""
    parts = self.splitter("First part|Second part|Third part", delimiters=["|"])
    self.assertEqual(parts, ["First part|", "Second part|", "Third part"])

def test_regex_split(self):
    """With no delimiters, the built-in sentence regex alone does the split."""
    sentences = self.splitter("This is a sentence. And another one! Yet another?")
    self.assertEqual(
        sentences,
        ["This is a sentence.", "And another one!", "Yet another?"],
    )

Expand Down

0 comments on commit fe63881

Please sign in to comment.