Skip to content

Commit

Permalink
feat: update regex splitter
Browse files Browse the repository at this point in the history
  • Loading branch information
simjak committed Jul 19, 2024
1 parent 21e8571 commit 30182ba
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 7 deletions.
2 changes: 1 addition & 1 deletion semantic_chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"ConsecutiveChunker",
"CumulativeChunker",
"StatisticalChunker",
"BaseSplitter",
"RegexSplitter",
"BaseSplitter",
]

__version__ = "0.0.8"
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/consecutive.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter
from semantic_chunkers.splitters.regex import RegexSplitter


class ConsecutiveChunker(BaseChunker):
Expand Down
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter
from semantic_chunkers.splitters.regex import RegexSplitter


class CumulativeChunker(BaseChunker):
Expand Down
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/statistical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter
from semantic_chunkers.splitters.regex import RegexSplitter
from semantic_chunkers.utils.logger import logger
from semantic_chunkers.utils.text import (
async_retry_with_timeout,
Expand Down
2 changes: 1 addition & 1 deletion semantic_chunkers/splitters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter
from semantic_chunkers.splitters.regex import RegexSplitter

__all__ = [
"BaseSplitter",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ class RegexSplitter(BaseSplitter):
"""

def __call__(self, doc: str) -> List[str]:
    """Split ``doc`` into stripped, non-empty sentences.

    The document is first broken on newline characters, so both blank
    lines (paragraph breaks) and single line breaks always act as
    sentence boundaries. Each resulting line is then split with
    ``self.regex_pattern``, applied with the ``regex.VERBOSE`` flag.

    Args:
        doc: The text to split.

    Returns:
        The sentences in document order, each with surrounding
        whitespace removed; empty and whitespace-only fragments are
        dropped.
    """
    sentences = []
    # Splitting on "\n" alone also handles "\n\n" paragraph breaks: the
    # empty strings produced between consecutive newlines are filtered out
    # below, so a separate paragraph-level pass would yield the same result.
    for line in doc.split("\n"):
        fragments = regex.split(self.regex_pattern, line, flags=regex.VERBOSE)
        for fragment in fragments:
            # Strip once and reuse the result for both the test and the append.
            stripped = fragment.strip()
            if stripped:
                sentences.append(stripped)
    return sentences
File renamed without changes.
43 changes: 43 additions & 0 deletions tests/unit/test_regex_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import unittest

from semantic_chunkers.splitters.regex import RegexSplitter


class TestRegexSplitter(unittest.TestCase):
    """Unit tests for calling a default-constructed ``RegexSplitter``."""

    def setUp(self):
        # A fresh splitter with default settings for every test.
        self.splitter = RegexSplitter()

    def test_split_by_double_newline(self):
        """A blank line between paragraphs yields one item per paragraph."""
        text = "This is the first paragraph.\n\nThis is the second paragraph."
        want = ["This is the first paragraph.", "This is the second paragraph."]
        self.assertEqual(self.splitter(text), want)

    def test_split_by_single_newline(self):
        """A single line break yields one item per line."""
        text = "This is the first line.\nThis is the second line."
        want = ["This is the first line.", "This is the second line."]
        self.assertEqual(self.splitter(text), want)

    def test_split_by_period(self):
        """Two sentences on the same line are returned as separate items."""
        text = "This is the first sentence. This is the second sentence."
        want = ["This is the first sentence.", "This is the second sentence."]
        self.assertEqual(self.splitter(text), want)

    def test_complex_split(self):
        """Paragraph breaks, line breaks and sentence ends combine correctly."""
        # The literal intentionally starts with a newline and must stay at
        # column 0 so that no indentation leaks into the test input.
        text = """
First paragraph.\n\nSecond paragraph.\nThird line in second paragraph. Fourth line.\n\nFifth paragraph."""
        want = [
            "First paragraph.",
            "Second paragraph.",
            "Third line in second paragraph.",
            "Fourth line.",
            "Fifth paragraph.",
        ]
        self.assertEqual(self.splitter(text), want)


# Run the unittest test runner when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 30182ba

Please sign in to comment.