From 19e4ff270907100585be016ec4032069d6a536bd Mon Sep 17 00:00:00 2001 From: questcollector Date: Fri, 4 Jul 2025 16:02:35 +0900 Subject: [PATCH] fix: add disallowed_special on tiktoken encode --- src/ragas/testset/transforms/base.py | 4 +++- src/ragas/utils.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ragas/testset/transforms/base.py b/src/ragas/testset/transforms/base.py index 6f5463571..d27ffd914 100644 --- a/src/ragas/testset/transforms/base.py +++ b/src/ragas/testset/transforms/base.py @@ -198,7 +198,9 @@ class LLMBasedExtractor(Extractor, PromptMixin): def split_text_by_token_limit(self, text, max_token_limit): # Tokenize the entire input string - tokens = self.tokenizer.encode(text) + # to prevent an error when the document contains special-token text like `<|endoftext|>`, + # pass an empty tuple as disallowed_special so such text is encoded as ordinary text + tokens = self.tokenizer.encode(text, disallowed_special=()) # Split tokens into chunks of max_token_limit or less chunks = [] diff --git a/src/ragas/utils.py b/src/ragas/utils.py index ca8962995..b89328b48 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -230,7 +230,9 @@ def camel_to_snake(name): def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int: """Returns the number of tokens in a text string.""" encoding = tiktoken.get_encoding(encoding_name) - num_tokens = len(encoding.encode(string)) + # to prevent an error when the document contains special-token text like `<|endoftext|>`, + # pass an empty tuple as disallowed_special so such text is encoded as ordinary text + num_tokens = len(encoding.encode(string, disallowed_special=())) return num_tokens