vectorize-io · connorblack · May 13, 2026 · May 13, 2026
diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py
@@ -168,6 +168,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_RETAIN_LLM_MAX_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF"
 ENV_RETAIN_LLM_TIMEOUT = "HINDSIGHT_API_RETAIN_LLM_TIMEOUT"
 ENV_RETAIN_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_RETAIN_LLM_LITELLMROUTER_CONFIG"
+ENV_RETAIN_LLM_EXTRA_BODY = "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY"
 
 ENV_REFLECT_LLM_PROVIDER = "HINDSIGHT_API_REFLECT_LLM_PROVIDER"
 ENV_REFLECT_LLM_API_KEY = "HINDSIGHT_API_REFLECT_LLM_API_KEY"
@@ -179,6 +180,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_REFLECT_LLM_MAX_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_MAX_BACKOFF"
 ENV_REFLECT_LLM_TIMEOUT = "HINDSIGHT_API_REFLECT_LLM_TIMEOUT"
 ENV_REFLECT_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_REFLECT_LLM_LITELLMROUTER_CONFIG"
+ENV_REFLECT_LLM_EXTRA_BODY = "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY"
 
 ENV_CONSOLIDATION_LLM_PROVIDER = "HINDSIGHT_API_CONSOLIDATION_LLM_PROVIDER"
 ENV_CONSOLIDATION_LLM_API_KEY = "HINDSIGHT_API_CONSOLIDATION_LLM_API_KEY"
@@ -190,6 +192,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_CONSOLIDATION_LLM_MAX_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_BACKOFF"
 ENV_CONSOLIDATION_LLM_TIMEOUT = "HINDSIGHT_API_CONSOLIDATION_LLM_TIMEOUT"
 ENV_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG"
+ENV_CONSOLIDATION_LLM_EXTRA_BODY = "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY"
 
 ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
 ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
@@ -927,6 +930,7 @@ class HindsightConfig:
     retain_llm_max_backoff: float | None
     retain_llm_timeout: float | None
     retain_llm_litellmrouter_config: dict | None
+    retain_llm_extra_body: dict | None
 
     reflect_llm_provider: str | None
     reflect_llm_api_key: str | None
@@ -938,6 +942,7 @@ class HindsightConfig:
     reflect_llm_max_backoff: float | None
     reflect_llm_timeout: float | None
     reflect_llm_litellmrouter_config: dict | None
+    reflect_llm_extra_body: dict | None
 
     consolidation_llm_provider: str | None
     consolidation_llm_api_key: str | None
@@ -949,6 +954,7 @@ class HindsightConfig:
     consolidation_llm_max_backoff: float | None
     consolidation_llm_timeout: float | None
     consolidation_llm_litellmrouter_config: dict | None
+    consolidation_llm_extra_body: dict | None
 
     # Embeddings
     embeddings_provider: str
@@ -1469,6 +1475,9 @@ def from_env(cls) -> "HindsightConfig":
             else None,
             retain_llm_timeout=float(os.getenv(ENV_RETAIN_LLM_TIMEOUT)) if os.getenv(ENV_RETAIN_LLM_TIMEOUT) else None,
             retain_llm_litellmrouter_config=_parse_llm_router_config(ENV_RETAIN_LLM_LITELLMROUTER_CONFIG),
+            retain_llm_extra_body=json.loads(os.getenv(ENV_RETAIN_LLM_EXTRA_BODY))
+            if os.getenv(ENV_RETAIN_LLM_EXTRA_BODY)
+            else None,
             reflect_llm_provider=os.getenv(ENV_REFLECT_LLM_PROVIDER) or None,
             reflect_llm_api_key=os.getenv(ENV_REFLECT_LLM_API_KEY) or None,
             reflect_llm_model=os.getenv(ENV_REFLECT_LLM_MODEL)
@@ -1494,6 +1503,9 @@ def from_env(cls) -> "HindsightConfig":
             if os.getenv(ENV_REFLECT_LLM_TIMEOUT)
             else None,
             reflect_llm_litellmrouter_config=_parse_llm_router_config(ENV_REFLECT_LLM_LITELLMROUTER_CONFIG),
+            reflect_llm_extra_body=json.loads(os.getenv(ENV_REFLECT_LLM_EXTRA_BODY))
+            if os.getenv(ENV_REFLECT_LLM_EXTRA_BODY)
+            else None,
             consolidation_llm_provider=os.getenv(ENV_CONSOLIDATION_LLM_PROVIDER) or None,
             consolidation_llm_api_key=os.getenv(ENV_CONSOLIDATION_LLM_API_KEY) or None,
             consolidation_llm_model=os.getenv(ENV_CONSOLIDATION_LLM_MODEL)
@@ -1519,6 +1531,9 @@ def from_env(cls) -> "HindsightConfig":
             if os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT)
             else None,
             consolidation_llm_litellmrouter_config=_parse_llm_router_config(ENV_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG),
+            consolidation_llm_extra_body=json.loads(os.getenv(ENV_CONSOLIDATION_LLM_EXTRA_BODY))
+            if os.getenv(ENV_CONSOLIDATION_LLM_EXTRA_BODY)
+            else None,
             # Embeddings
             embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER),
             embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL),

diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -573,7 +573,7 @@ def __init__(
             api_key=retain_api_key,
             base_url=retain_base_url,
             model=retain_model,
-            extra_body=config.llm_extra_body,
+            extra_body=config.retain_llm_extra_body or config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.retain_llm_litellmrouter_config or config.llm_litellmrouter_config,
         )
@@ -597,7 +597,7 @@ def __init__(
             api_key=reflect_api_key,
             base_url=reflect_base_url,
             model=reflect_model,
-            extra_body=config.llm_extra_body,
+            extra_body=config.reflect_llm_extra_body or config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.reflect_llm_litellmrouter_config or config.llm_litellmrouter_config,
         )
@@ -621,7 +621,7 @@ def __init__(
             api_key=consolidation_api_key,
             base_url=consolidation_base_url,
             model=consolidation_model,
-            extra_body=config.llm_extra_body,
+            extra_body=config.consolidation_llm_extra_body or config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.consolidation_llm_litellmrouter_config or config.llm_litellmrouter_config,
         )

diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -10,6 +10,7 @@
 import asyncio
 import json
 import logging
+import os
 import re
 import time
 from typing import TYPE_CHECKING, Any, Awaitable, Callable
@@ -562,7 +563,22 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
         if include_recall:
             forced_sequence.append("recall")
 
-        if iteration < len(forced_sequence):
+        # Escape hatch for inference engines whose tool-call parser breaks
+        # when tool_choice forces a specific function name. vLLM (as of v0.20.2
+        # / nightly 2026-05-13) has open issues #35936 and #33965: forced
+        # tool_choice ("required" and named-function modes) bypasses the
+        # configured --tool-call-parser and uses JSON-only validation. For
+        # models that emit XML-style tool calls (Qwen3 family with qwen3_coder
+        # parser), this silently returns tool_calls=[] while finish_reason
+        # still reports "tool_calls". Set
+        # HINDSIGHT_API_REFLECT_DISABLE_FORCED_TOOL_CHOICE to true, 1, or yes
+        # (case-insensitive) to use tool_choice="auto" on every iteration; the
+        # model is then expected to call the retrieval tools on its own for
+        # factual queries, without API-level forcing.
+        disable_forced = os.getenv(
+            "HINDSIGHT_API_REFLECT_DISABLE_FORCED_TOOL_CHOICE", ""
+        ).lower() in ("true", "1", "yes")
+        if not disable_forced and iteration < len(forced_sequence):
             iter_tool_choice: str | dict = {"type": "function", "function": {"name": forced_sequence[iteration]}}
         else:
             iter_tool_choice = "auto"

diff --git a/hindsight-api-slim/tests/test_per_scope_llm_extra_body.py b/hindsight-api-slim/tests/test_per_scope_llm_extra_body.py
@@ -0,0 +1,104 @@
+"""Tests for per-scope ``HINDSIGHT_API_<SCOPE>_LLM_EXTRA_BODY`` env vars.
+
+Verifies config parsing only (per-scope-over-global fallback happens at engine init):
+- Each scope's extra_body env var parses as JSON into the Config dataclass.
+- A global-only ``HINDSIGHT_API_LLM_EXTRA_BODY`` leaves the per-scope fields None.
+- Unset global + unset per-scope = None (no extra_body merged at call time).
+- Invalid JSON in a per-scope var raises ``json.JSONDecodeError`` at parse time.
+"""
+
+import json
+import os
+
+import pytest
+
+
+@pytest.fixture
+def reset_env_extra_body():
+    """Snapshot + restore the extra_body and bootstrap env vars around each test."""
+    from hindsight_api.config import clear_config_cache
+
+    keys = [
+        "HINDSIGHT_API_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY",
+        # required to make get_config() succeed without a real LLM provider
+        "HINDSIGHT_API_SKIP_LLM_VERIFICATION",
+        "HINDSIGHT_API_LAZY_RERANKER",
+        "HINDSIGHT_API_LLM_PROVIDER",
+        "HINDSIGHT_API_LLM_MODEL",
+    ]
+    saved = {k: os.environ.get(k) for k in keys}
+    os.environ["HINDSIGHT_API_SKIP_LLM_VERIFICATION"] = "true"
+    os.environ["HINDSIGHT_API_LAZY_RERANKER"] = "true"
+    os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "mock"
+    os.environ["HINDSIGHT_API_LLM_MODEL"] = "default-model"
+    # Global and per-scope extra_body vars start unset; tests opt in by setting them explicitly.
+    for k in [
+        "HINDSIGHT_API_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY",
+        "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY",
+    ]:
+        os.environ.pop(k, None)
+    clear_config_cache()
+    yield
+    for k, v in saved.items():
+        if v is None:
+            os.environ.pop(k, None)
+        else:
+            os.environ[k] = v
+    clear_config_cache()
+
+
+class TestPerScopeLLMExtraBody:
+    def test_unset_yields_none_on_all_scopes(self, reset_env_extra_body):
+        from hindsight_api.config import get_config
+
+        config = get_config()
+        assert config.llm_extra_body is None
+        assert config.retain_llm_extra_body is None
+        assert config.reflect_llm_extra_body is None
+        assert config.consolidation_llm_extra_body is None
+
+    def test_per_scope_env_parses_as_json(self, reset_env_extra_body):
+        from hindsight_api.config import clear_config_cache, get_config
+
+        os.environ["HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY"] = json.dumps(
+            {"temperature": 0.6, "top_p": 0.8}
+        )
+        os.environ["HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY"] = json.dumps(
+            {"chat_template_kwargs": {"enable_thinking": True}}
+        )
+        os.environ["HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY"] = json.dumps(
+            {"chat_template_kwargs": {"enable_thinking": False}, "presence_penalty": 0.0}
+        )
+        clear_config_cache()
+        config = get_config()
+        assert config.retain_llm_extra_body == {"temperature": 0.6, "top_p": 0.8}
+        assert config.reflect_llm_extra_body == {"chat_template_kwargs": {"enable_thinking": True}}
+        assert config.consolidation_llm_extra_body == {
+            "chat_template_kwargs": {"enable_thinking": False},
+            "presence_penalty": 0.0,
+        }
+
+    def test_global_only_leaves_per_scope_none(self, reset_env_extra_body):
+        from hindsight_api.config import clear_config_cache, get_config
+
+        os.environ["HINDSIGHT_API_LLM_EXTRA_BODY"] = json.dumps({"temperature": 0.7})
+        clear_config_cache()
+        config = get_config()
+        assert config.llm_extra_body == {"temperature": 0.7}
+        # Per-scope fields stay None — fallback happens at engine init, not parse time.
+        assert config.retain_llm_extra_body is None
+        assert config.reflect_llm_extra_body is None
+        assert config.consolidation_llm_extra_body is None
+
+    def test_invalid_json_raises_at_parse_time(self, reset_env_extra_body):
+        from hindsight_api.config import clear_config_cache, get_config
+
+        os.environ["HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY"] = "{not valid json"
+        clear_config_cache()
+        with pytest.raises(json.JSONDecodeError):
+            get_config()