From 19e4ff270907100585be016ec4032069d6a536bd Mon Sep 17 00:00:00 2001
From: questcollector <miroirs01@gmail.com>
Date: Fri, 4 Jul 2025 16:02:35 +0900
Subject: [PATCH] fix: add disallowed_special on tiktoken encode

---
 src/ragas/testset/transforms/base.py | 4 +++-
 src/ragas/utils.py                   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/ragas/testset/transforms/base.py b/src/ragas/testset/transforms/base.py
index 6f5463571..d27ffd914 100644
--- a/src/ragas/testset/transforms/base.py
+++ b/src/ragas/testset/transforms/base.py
@@ -198,7 +198,9 @@ class LLMBasedExtractor(Extractor, PromptMixin):
 
     def split_text_by_token_limit(self, text, max_token_limit):
         # Tokenize the entire input string
-        tokens = self.tokenizer.encode(text)
+        # Text containing a special token such as `<|endoftext|>` makes encode()
+        # raise by default; disallowed_special=() encodes it as ordinary text.
+        tokens = self.tokenizer.encode(text, disallowed_special=())
 
         # Split tokens into chunks of max_token_limit or less
         chunks = []
diff --git a/src/ragas/utils.py b/src/ragas/utils.py
index ca8962995..b89328b48 100644
--- a/src/ragas/utils.py
+++ b/src/ragas/utils.py
@@ -230,7 +230,9 @@ def camel_to_snake(name):
 def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
     """Returns the number of tokens in a text string."""
     encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
+    # Text containing a special token such as `<|endoftext|>` makes encode()
+    # raise by default; disallowed_special=() encodes it as ordinary text.
+    num_tokens = len(encoding.encode(string, disallowed_special=()))
     return num_tokens