diff --git a/mlx_lm/tokenizer_utils.py b/mlx_lm/tokenizer_utils.py
index c7e50fbe7..a10d7cb55 100644
--- a/mlx_lm/tokenizer_utils.py
+++ b/mlx_lm/tokenizer_utils.py
@@ -358,7 +358,7 @@ def add_eos_token(self, token: str):
         self._eos_token_ids.add(token_id)
 
     def _find(self, tokens, sequence, start=None, end=None, reverse=False):
-        start = start or 0
+        start = max(start or 0, 0)
         end = end or len(tokens)
         outer_loop = (
             range(end - len(sequence), start - 1, -1)
diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
index 54906af1c..cce4dd6a3 100644
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -109,6 +109,18 @@ def test_thinking(self):
         self.assertIsNone(tokenizer.think_start_id)
         self.assertIsNone(tokenizer.think_end_id)
 
+    def test_think_search_short_prompt_negative_start(self):
+        # Regression for #1326: the server computes start = len(prompt) - 11 and passes
+        # it to rfind_think_start. For prompts shorter than 11 tokens that start is
+        # negative; the think-token search must clamp it instead of indexing past the
+        # token list (IndexError, which the server surfaces as a 404).
+        tokenizer = load_tokenizer("mlx-community/Qwen3-4B-4bit")
+        self.assertTrue(tokenizer.has_thinking)
+        short_tokens = [0, 1, 2]  # 3-token prompt with no think-start present
+        start = len(short_tokens) - 11  # = -8, exactly what server.py passes
+        self.assertEqual(tokenizer.rfind_think_start(short_tokens, start=start), -1)
+        self.assertEqual(tokenizer.find_think_start(short_tokens, start=start), -1)
+
 
 if __name__ == "__main__":
     unittest.main()