From 39d9305aab5fda320afa616bed32505cbae04f54 Mon Sep 17 00:00:00 2001
From: valovtsov
Date: Wed, 23 Jul 2025 13:08:04 +0300
Subject: [PATCH 1/2] Count tokens in HeadlineSplitter

---
 .../testset/transforms/splitters/headline.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ragas/src/ragas/testset/transforms/splitters/headline.py b/ragas/src/ragas/testset/transforms/splitters/headline.py
index 8affc65e4..d9381d8c9 100644
--- a/ragas/src/ragas/testset/transforms/splitters/headline.py
+++ b/ragas/src/ragas/testset/transforms/splitters/headline.py
@@ -1,8 +1,13 @@
 import typing as t
 from dataclasses import dataclass
 
+import tiktoken
 from ragas.testset.graph import Node, NodeType, Relationship
 from ragas.testset.transforms.base import Splitter
+from ragas.utils import num_tokens_from_string
+
+
+DEFAULT_TOKENIZER = tiktoken.get_encoding("o200k_base")
 
 
 @dataclass
@@ -15,27 +20,27 @@ def adjust_chunks(self, chunks):
         current_chunk = ""
 
         for chunk in chunks:
-            chunk_tokens = chunk.split()
+            chunk_tokens = DEFAULT_TOKENIZER.encode(chunk)
 
             # Split chunks that are over max_tokens
             while len(chunk_tokens) > self.max_tokens:
-                adjusted_chunks.append(" ".join(chunk_tokens[: self.max_tokens]))
-                chunk_tokens = chunk_tokens[self.max_tokens :]
+                adjusted_chunks.append(DEFAULT_TOKENIZER.decode(chunk_tokens[:self.max_tokens]))
+                chunk_tokens = chunk_tokens[self.max_tokens:]
 
-            # Handle chunks that are under min_tokens
+            chunk_str = DEFAULT_TOKENIZER.decode(chunk_tokens)
             if len(chunk_tokens) < self.min_tokens:
                 if current_chunk:
-                    current_chunk += " " + " ".join(chunk_tokens)
-                    if len(current_chunk.split()) >= self.min_tokens:
+                    current_chunk += " " + chunk_str
+                    if num_tokens_from_string(current_chunk) >= self.min_tokens:
                         adjusted_chunks.append(current_chunk)
                         current_chunk = ""
                 else:
-                    current_chunk = " ".join(chunk_tokens)
+                    current_chunk = chunk_str
             else:
                 if current_chunk:
                     adjusted_chunks.append(current_chunk)
                     current_chunk = ""
-                adjusted_chunks.append(" ".join(chunk_tokens))
+                adjusted_chunks.append(chunk_str)
 
         # Append any remaining chunk
         if current_chunk:

From 3a3a482200e6a827e5be095cb04072075ce757ca Mon Sep 17 00:00:00 2001
From: valovtsov
Date: Wed, 23 Jul 2025 13:28:21 +0300
Subject: [PATCH 2/2] Count tokens in split and adjust_chunks

---
 ragas/src/ragas/testset/transforms/splitters/headline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ragas/src/ragas/testset/transforms/splitters/headline.py b/ragas/src/ragas/testset/transforms/splitters/headline.py
index d9381d8c9..694130b23 100644
--- a/ragas/src/ragas/testset/transforms/splitters/headline.py
+++ b/ragas/src/ragas/testset/transforms/splitters/headline.py
@@ -31,7 +31,7 @@ def adjust_chunks(self, chunks):
             if len(chunk_tokens) < self.min_tokens:
                 if current_chunk:
                     current_chunk += " " + chunk_str
-                    if num_tokens_from_string(current_chunk) >= self.min_tokens:
+                    if num_tokens_from_string(current_chunk, encoding_name=DEFAULT_TOKENIZER.name) >= self.min_tokens:
                         adjusted_chunks.append(current_chunk)
                         current_chunk = ""
                 else:
@@ -57,7 +57,7 @@ async def split(self, node: Node) -> t.Tuple[t.List[Node], t.List[Relationship]]
         if headlines is None:
             raise ValueError("'headlines' property not found in this node")
 
-        if len(text.split()) < self.min_tokens:
+        if num_tokens_from_string(text, encoding_name=DEFAULT_TOKENIZER.name) < self.min_tokens:
             return [node], []
         # create the chunks for the different sections
         indices = [0]
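
A quick way to sanity-check the new behaviour without a full ragas checkout is the standalone sketch below. It mirrors the token-based adjust_chunks logic from the patches above using only tiktoken; the function name, the min_tokens/max_tokens values, and the sample chunks are illustrative, not the HeadlineSplitter defaults.

# sketch.py -- standalone approximation of the token-based adjust_chunks above.
# Assumes only `pip install tiktoken`; the limits below are illustrative.
import tiktoken

TOKENIZER = tiktoken.get_encoding("o200k_base")

def adjust_chunks(chunks, min_tokens=5, max_tokens=10):
    adjusted, current = [], ""
    for chunk in chunks:
        tokens = TOKENIZER.encode(chunk)
        # Hard-split anything longer than max_tokens on token boundaries.
        while len(tokens) > max_tokens:
            adjusted.append(TOKENIZER.decode(tokens[:max_tokens]))
            tokens = tokens[max_tokens:]
        text = TOKENIZER.decode(tokens)
        if len(tokens) < min_tokens:
            # Accumulate short chunks until the merged text reaches min_tokens.
            current = f"{current} {text}" if current else text
            if len(TOKENIZER.encode(current)) >= min_tokens:
                adjusted.append(current)
                current = ""
        else:
            if current:
                adjusted.append(current)
                current = ""
            adjusted.append(text)
    if current:  # flush any leftover short chunk
        adjusted.append(current)
    return adjusted

print(adjust_chunks([
    "short",
    "a much longer chunk that should be split into ten-token pieces by the while loop",
    "tail",
]))

Note that merged chunks are re-encoded to check the min_tokens threshold, matching the patched code's num_tokens_from_string call with the same encoding, so word-count and token-count can no longer disagree about when a chunk is "big enough".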