Skip to content

Commit 30182ba

Browse files
committed
feat: update regex splitter
1 parent 21e8571 commit 30182ba

File tree

8 files changed

+62
-7
lines changed

8 files changed

+62
-7
lines changed

semantic_chunkers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
"ConsecutiveChunker",
1212
"CumulativeChunker",
1313
"StatisticalChunker",
14-
"BaseSplitter",
1514
"RegexSplitter",
15+
"BaseSplitter",
1616
]
1717

1818
__version__ = "0.0.8"

semantic_chunkers/chunkers/consecutive.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from semantic_chunkers.chunkers.base import BaseChunker
88
from semantic_chunkers.schema import Chunk
99
from semantic_chunkers.splitters.base import BaseSplitter
10-
from semantic_chunkers.splitters.sentence import RegexSplitter
10+
from semantic_chunkers.splitters.regex import RegexSplitter
1111

1212

1313
class ConsecutiveChunker(BaseChunker):

semantic_chunkers/chunkers/cumulative.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from semantic_chunkers.chunkers.base import BaseChunker
88
from semantic_chunkers.schema import Chunk
99
from semantic_chunkers.splitters.base import BaseSplitter
10-
from semantic_chunkers.splitters.sentence import RegexSplitter
10+
from semantic_chunkers.splitters.regex import RegexSplitter
1111

1212

1313
class CumulativeChunker(BaseChunker):

semantic_chunkers/chunkers/statistical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from semantic_chunkers.chunkers.base import BaseChunker
1010
from semantic_chunkers.schema import Chunk
1111
from semantic_chunkers.splitters.base import BaseSplitter
12-
from semantic_chunkers.splitters.sentence import RegexSplitter
12+
from semantic_chunkers.splitters.regex import RegexSplitter
1313
from semantic_chunkers.utils.logger import logger
1414
from semantic_chunkers.utils.text import (
1515
async_retry_with_timeout,

semantic_chunkers/splitters/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from semantic_chunkers.splitters.base import BaseSplitter
2-
from semantic_chunkers.splitters.sentence import RegexSplitter
2+
from semantic_chunkers.splitters.regex import RegexSplitter
33

44
__all__ = [
55
"BaseSplitter",

semantic_chunkers/splitters/sentence.py renamed to semantic_chunkers/splitters/regex.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,18 @@ class RegexSplitter(BaseSplitter):
5252
"""
5353

5454
def __call__(self, doc: str) -> List[str]:
    """Split ``doc`` into stripped, non-empty sentences.

    Newline characters act as hard boundaries: the text is first cut at
    every newline (which also covers blank-line paragraph breaks), and each
    resulting line is then split with ``self.regex_pattern`` using the
    ``VERBOSE`` flag.

    :param doc: Text to split.
    :return: Sentences in document order, with surrounding whitespace
        removed and empty pieces dropped.
    """
    sentences: List[str] = []
    # Splitting on "\n" alone yields the same non-empty lines as splitting
    # on "\n\n" first and "\n" second, because blank pieces are discarded.
    for line in doc.split("\n"):
        if not line.strip():
            continue  # a blank line cannot contribute a sentence
        for piece in regex.split(self.regex_pattern, line, flags=regex.VERBOSE):
            stripped = piece.strip()  # strip once, reuse for test and append
            if stripped:
                sentences.append(stripped)
    return sentences
File renamed without changes.

tests/unit/test_regex_splitter.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import unittest
2+
3+
from semantic_chunkers.splitters.regex import RegexSplitter
4+
5+
6+
class TestRegexSplitter(unittest.TestCase):
    """Checks how a default ``RegexSplitter`` cuts text into sentences."""

    def setUp(self):
        # A fresh default-configured splitter for every test.
        self.splitter = RegexSplitter()

    def _assert_splits(self, text, wanted):
        # Shared check: run the splitter on ``text`` and compare the output.
        self.assertEqual(self.splitter(text), wanted)

    def test_split_by_double_newline(self):
        # A blank line separates two paragraphs.
        self._assert_splits(
            "This is the first paragraph.\n\nThis is the second paragraph.",
            ["This is the first paragraph.", "This is the second paragraph."],
        )

    def test_split_by_single_newline(self):
        # A single newline separates two lines.
        self._assert_splits(
            "This is the first line.\nThis is the second line.",
            ["This is the first line.", "This is the second line."],
        )

    def test_split_by_period(self):
        # Two sentences on the same line are separated at the period.
        self._assert_splits(
            "This is the first sentence. This is the second sentence.",
            ["This is the first sentence.", "This is the second sentence."],
        )

    def test_complex_split(self):
        # Paragraph breaks, line breaks and sentence ends mixed together.
        text = (
            "\n"
            "First paragraph.\n\nSecond paragraph.\n"
            "Third line in second paragraph. Fourth line.\n\nFifth paragraph."
        )
        wanted = [
            "First paragraph.",
            "Second paragraph.",
            "Third line in second paragraph.",
            "Fourth line.",
            "Fifth paragraph.",
        ]
        self._assert_splits(text, wanted)
40+
41+
42+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)