From 1696421e6e299bb254539cbc3ab08ae77da71f81 Mon Sep 17 00:00:00 2001
From: Ismail Ashraq
Date: Mon, 27 May 2024 13:46:29 +0500
Subject: [PATCH 1/5] splitters module

---
 semantic_chunkers/chunkers/base.py        |  6 ++++--
 semantic_chunkers/chunkers/consecutive.py |  5 ++++-
 semantic_chunkers/chunkers/cumulative.py  |  5 ++++-
 semantic_chunkers/chunkers/statistical.py |  5 ++++-
 semantic_chunkers/splitters/sentence.py   | 18 ++++++++----------
 5 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/semantic_chunkers/chunkers/base.py b/semantic_chunkers/chunkers/base.py
index 7517c61..4f3e47d 100644
--- a/semantic_chunkers/chunkers/base.py
+++ b/semantic_chunkers/chunkers/base.py
@@ -5,12 +5,14 @@
 from semantic_router.encoders.base import BaseEncoder
 
 from semantic_chunkers.schema import Chunk
-from semantic_chunkers.splitters.sentence import regex_splitter
+from semantic_chunkers.splitters.base import BaseSplitter
+from semantic_chunkers.splitters.sentence import RegexSplitter
 
 
 class BaseChunker(BaseModel):
     name: str
     encoder: BaseEncoder
+    splitter: BaseSplitter
 
     class Config:
         extra = Extra.allow
@@ -19,7 +21,7 @@ def __call__(self, docs: List[str]) -> List[List[Chunk]]:
         raise NotImplementedError("Subclasses must implement this method")
 
     def _split(self, doc: str) -> List[str]:
-        return regex_splitter(doc)
+        return self.splitter(doc)
 
     def _chunk(self, splits: List[Any]) -> List[Chunk]:
         raise NotImplementedError("Subclasses must implement this method")

diff --git a/semantic_chunkers/chunkers/consecutive.py b/semantic_chunkers/chunkers/consecutive.py
index 91d7134..1b5664f 100644
--- a/semantic_chunkers/chunkers/consecutive.py
+++ b/semantic_chunkers/chunkers/consecutive.py
@@ -6,6 +6,8 @@
 from semantic_router.encoders.base import BaseEncoder
 from semantic_chunkers.schema import Chunk
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.splitters.base import BaseSplitter
+from semantic_chunkers.splitters.sentence import RegexSplitter
 
 
 class ConsecutiveChunker(BaseChunker):
@@ -16,10 +18,11 @@
     def __init__(
         self,
         encoder: BaseEncoder,
+        splitter: BaseSplitter = RegexSplitter(),
         name: str = "consecutive_chunker",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, encoder=encoder)
+        super().__init__(name=name, encoder=encoder, splitter=splitter)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold
 

diff --git a/semantic_chunkers/chunkers/cumulative.py b/semantic_chunkers/chunkers/cumulative.py
index 8987b04..973952e 100644
--- a/semantic_chunkers/chunkers/cumulative.py
+++ b/semantic_chunkers/chunkers/cumulative.py
@@ -6,6 +6,8 @@
 from semantic_router.encoders import BaseEncoder
 from semantic_chunkers.schema import Chunk
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.splitters.base import BaseSplitter
+from semantic_chunkers.splitters.sentence import RegexSplitter
 
 
 class CumulativeChunker(BaseChunker):
@@ -17,10 +19,11 @@
     def __init__(
         self,
         encoder: BaseEncoder,
+        splitter: BaseSplitter = RegexSplitter(),
        name: str = "cumulative_chunker",
         score_threshold: float = 0.45,
     ):
-        super().__init__(name=name, encoder=encoder)
+        super().__init__(name=name, encoder=encoder, splitter=splitter)
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold
 

diff --git a/semantic_chunkers/chunkers/statistical.py b/semantic_chunkers/chunkers/statistical.py
index a6997ba..808414e 100644
--- a/semantic_chunkers/chunkers/statistical.py
+++ b/semantic_chunkers/chunkers/statistical.py
@@ -6,6 +6,8 @@
 from semantic_router.encoders.base import BaseEncoder
 from semantic_chunkers.schema import Chunk
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.splitters.base import BaseSplitter
+from semantic_chunkers.splitters.sentence import RegexSplitter
 from semantic_chunkers.utils.text import tiktoken_length
 from semantic_chunkers.utils.logger import logger
 
@@ -39,6 +41,7 @@ class StatisticalChunker(BaseChunker):
     def __init__(
         self,
         encoder: BaseEncoder,
+        splitter: BaseSplitter = RegexSplitter(),
         name="statistical_chunker",
         threshold_adjustment=0.01,
         dynamic_threshold: bool = True,
@@ -49,7 +52,7 @@ def __init__(
         plot_chunks=False,
         enable_statistics=False,
     ):
-        super().__init__(name=name, encoder=encoder)
+        super().__init__(name=name, encoder=encoder, splitter=splitter)
         self.calculated_threshold: float
         self.encoder = encoder
         self.threshold_adjustment = threshold_adjustment

diff --git a/semantic_chunkers/splitters/sentence.py b/semantic_chunkers/splitters/sentence.py
index 9e75adc..343f525 100644
--- a/semantic_chunkers/splitters/sentence.py
+++ b/semantic_chunkers/splitters/sentence.py
@@ -1,7 +1,8 @@
 import regex
+from semantic_chunkers.splitters.base import BaseSplitter
 
 
-def regex_splitter(text: str) -> list[str]:
+class RegexSplitter(BaseSplitter):
     """
     Enhanced regex pattern to split a given text into sentences more accurately.
 
@@ -11,12 +12,6 @@ def regex_splitter(text: str) -> list[str]:
     - Decimal numbers and dates.
     - Ellipses and other punctuation marks used in informal text.
     - Removing control characters and format characters.
-
-    Args:
-        text (str): The text to split into sentences.
-
-    Returns:
-        list: A list of sentences extracted from the text.
""" regex_pattern = r""" # Negative lookbehind for word boundary, word char, dot, word char @@ -51,6 +46,9 @@ def regex_splitter(text: str) -> list[str]: # Matches and removes control characters and format characters [\p{Cc}\p{Cf}]+ """ - sentences = regex.split(regex_pattern, text, flags=regex.VERBOSE) - sentences = [sentence.strip() for sentence in sentences if sentence.strip()] - return sentences + + def __call__(self, doc: str) -> list[str]: + print("Using regex splitter..") + sentences = regex.split(self.regex_pattern, doc, flags=regex.VERBOSE) + sentences = [sentence.strip() for sentence in sentences if sentence.strip()] + return sentences From 942b3ece8f61b3bd6ea69208a44172c3eb37e593 Mon Sep 17 00:00:00 2001 From: Ismail Ashraq Date: Mon, 27 May 2024 14:01:08 +0500 Subject: [PATCH 2/5] linting --- semantic_chunkers/chunkers/base.py | 1 - semantic_chunkers/splitters/sentence.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/semantic_chunkers/chunkers/base.py b/semantic_chunkers/chunkers/base.py index 4f3e47d..4351e57 100644 --- a/semantic_chunkers/chunkers/base.py +++ b/semantic_chunkers/chunkers/base.py @@ -6,7 +6,6 @@ from semantic_router.encoders.base import BaseEncoder from semantic_chunkers.schema import Chunk from semantic_chunkers.splitters.base import BaseSplitter -from semantic_chunkers.splitters.sentence import RegexSplitter class BaseChunker(BaseModel): diff --git a/semantic_chunkers/splitters/sentence.py b/semantic_chunkers/splitters/sentence.py index 343f525..10f2b05 100644 --- a/semantic_chunkers/splitters/sentence.py +++ b/semantic_chunkers/splitters/sentence.py @@ -13,6 +13,7 @@ class RegexSplitter(BaseSplitter): - Ellipses and other punctuation marks used in informal text. - Removing control characters and format characters. """ + regex_pattern = r""" # Negative lookbehind for word boundary, word char, dot, word char (? 
Date: Mon, 27 May 2024 14:03:32 +0500
Subject: [PATCH 3/5] remove print statement

---
 semantic_chunkers/splitters/sentence.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/semantic_chunkers/splitters/sentence.py b/semantic_chunkers/splitters/sentence.py
index 10f2b05..cd8b2b3 100644
--- a/semantic_chunkers/splitters/sentence.py
+++ b/semantic_chunkers/splitters/sentence.py
@@ -1,4 +1,6 @@
 import regex
+from typing import List
+
 
 from semantic_chunkers.splitters.base import BaseSplitter
 
@@ -48,8 +50,7 @@ class RegexSplitter(BaseSplitter):
         [\p{Cc}\p{Cf}]+
     """
 
-    def __call__(self, doc: str) -> list[str]:
-        print("Using regex splitter..")
+    def __call__(self, doc: str) -> List[str]:
         sentences = regex.split(self.regex_pattern, doc, flags=regex.VERBOSE)
         sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
         return sentences

From f1276243dcf5256c9355507cc46bec9c83e6fa1b Mon Sep 17 00:00:00 2001
From: Ismail Ashraq
Date: Mon, 27 May 2024 14:15:36 +0500
Subject: [PATCH 4/5] fix pytests

---
 semantic_chunkers/__init__.py           | 13 +++++++++----
 semantic_chunkers/splitters/__init__.py |  8 ++++++++
 tests/unit/test_splitters.py            |  9 ++++++++-
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/semantic_chunkers/__init__.py b/semantic_chunkers/__init__.py
index 560e3a2..f41c50f 100644
--- a/semantic_chunkers/__init__.py
+++ b/semantic_chunkers/__init__.py
@@ -1,13 +1,18 @@
-from semantic_chunkers.chunkers import BaseChunker
-from semantic_chunkers.chunkers import ConsecutiveChunker
-from semantic_chunkers.chunkers import CumulativeChunker
-from semantic_chunkers.chunkers import StatisticalChunker
+from semantic_chunkers.chunkers import (
+    BaseChunker,
+    ConsecutiveChunker,
+    CumulativeChunker,
+    StatisticalChunker,
+)
+from semantic_chunkers.splitters import BaseSplitter, RegexSplitter
 
 __all__ = [
     "BaseChunker",
     "ConsecutiveChunker",
     "CumulativeChunker",
     "StatisticalChunker",
+    "BaseSplitter",
+    "RegexSplitter",
 ]
 
 __version__ = "0.0.5"

diff --git a/semantic_chunkers/splitters/__init__.py b/semantic_chunkers/splitters/__init__.py
index e69de29..c6d858a 100644
--- a/semantic_chunkers/splitters/__init__.py
+++ b/semantic_chunkers/splitters/__init__.py
@@ -0,0 +1,8 @@
+from semantic_chunkers.splitters.base import BaseSplitter
+from semantic_chunkers.splitters.sentence import RegexSplitter
+
+
+__all__ = [
+    "BaseSplitter",
+    "RegexSplitter",
+]

diff --git a/tests/unit/test_splitters.py b/tests/unit/test_splitters.py
index 8584dbb..b37e891 100644
--- a/tests/unit/test_splitters.py
+++ b/tests/unit/test_splitters.py
@@ -6,6 +6,7 @@
 from semantic_router.encoders.base import BaseEncoder
 from semantic_router.encoders.cohere import CohereEncoder
 from semantic_chunkers import BaseChunker
+from semantic_chunkers import BaseSplitter
 from semantic_chunkers import ConsecutiveChunker
 from semantic_chunkers import CumulativeChunker
 
@@ -106,7 +107,13 @@ def base_splitter_instance():
     mock_encoder = Mock(spec=BaseEncoder)
     mock_encoder.name = "mock_encoder"
     mock_encoder.score_threshold = 0.5
-    return BaseChunker(name="test_splitter", encoder=mock_encoder, score_threshold=0.5)
+    mock_splitter = Mock(spec=BaseSplitter)
+    return BaseChunker(
+        name="test_splitter",
+        encoder=mock_encoder,
+        splitter=mock_splitter,
+        score_threshold=0.5,
+    )
 
 
 def test_base_splitter_call_not_implemented(base_splitter_instance):

From 49accb88b80b60bc62d0b57f808ed4812ef3bdfe Mon Sep 17 00:00:00 2001
From: Ismail Ashraq
Date: Mon, 27 May 2024 14:20:29 +0500
Subject: [PATCH 5/5] base splitter

---
 semantic_chunkers/splitters/base.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 semantic_chunkers/splitters/base.py

diff --git a/semantic_chunkers/splitters/base.py b/semantic_chunkers/splitters/base.py
new file mode 100644
index 0000000..7969c45
--- /dev/null
+++ b/semantic_chunkers/splitters/base.py
@@ -0,0 +1,11 @@
+from typing import List
+
+from pydantic.v1 import BaseModel, Extra
+
+
+class BaseSplitter(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    def __call__(self, doc: str) -> List[str]:
+        raise NotImplementedError("Subclasses must implement this method")
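
Usage sketch (editor's note, not part of the patch series): after these commits, a chunker accepts any BaseSplitter whose __call__(doc: str) -> List[str] returns the pre-split segments, with RegexSplitter remaining the default. The code below is a minimal illustration under two assumptions: a CohereEncoder configured via the COHERE_API_KEY environment variable, and a hypothetical ParagraphSplitter that does not exist in this PR.

    from typing import List

    from semantic_router.encoders.cohere import CohereEncoder

    from semantic_chunkers import RegexSplitter, StatisticalChunker
    from semantic_chunkers.splitters import BaseSplitter


    class ParagraphSplitter(BaseSplitter):
        """Hypothetical splitter: breaks a document on blank lines instead of sentences."""

        def __call__(self, doc: str) -> List[str]:
            return [part.strip() for part in doc.split("\n\n") if part.strip()]


    # Assumes COHERE_API_KEY is set in the environment.
    encoder = CohereEncoder()

    # Passing the default sentence splitter explicitly is equivalent to omitting it.
    sentence_chunker = StatisticalChunker(encoder=encoder, splitter=RegexSplitter())

    # Injecting a custom BaseSplitter changes how documents are pre-split before encoding.
    paragraph_chunker = StatisticalChunker(encoder=encoder, splitter=ParagraphSplitter())

    # Per BaseChunker.__call__(docs: List[str]), this returns a list of Chunk lists.
    chunks = paragraph_chunker(["First paragraph.\n\nSecond paragraph on a new topic."])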