diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py index fd5f88652..dca31d637 100644 --- a/hindsight-api-slim/hindsight_api/config.py +++ b/hindsight-api-slim/hindsight_api/config.py @@ -168,6 +168,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: ENV_RETAIN_LLM_MAX_BACKOFF = "HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF" ENV_RETAIN_LLM_TIMEOUT = "HINDSIGHT_API_RETAIN_LLM_TIMEOUT" ENV_RETAIN_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_RETAIN_LLM_LITELLMROUTER_CONFIG" +ENV_RETAIN_LLM_EXTRA_BODY = "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY" ENV_REFLECT_LLM_PROVIDER = "HINDSIGHT_API_REFLECT_LLM_PROVIDER" ENV_REFLECT_LLM_API_KEY = "HINDSIGHT_API_REFLECT_LLM_API_KEY" @@ -179,6 +180,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: ENV_REFLECT_LLM_MAX_BACKOFF = "HINDSIGHT_API_REFLECT_LLM_MAX_BACKOFF" ENV_REFLECT_LLM_TIMEOUT = "HINDSIGHT_API_REFLECT_LLM_TIMEOUT" ENV_REFLECT_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_REFLECT_LLM_LITELLMROUTER_CONFIG" +ENV_REFLECT_LLM_EXTRA_BODY = "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY" ENV_CONSOLIDATION_LLM_PROVIDER = "HINDSIGHT_API_CONSOLIDATION_LLM_PROVIDER" ENV_CONSOLIDATION_LLM_API_KEY = "HINDSIGHT_API_CONSOLIDATION_LLM_API_KEY" @@ -190,6 +192,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]: ENV_CONSOLIDATION_LLM_MAX_BACKOFF = "HINDSIGHT_API_CONSOLIDATION_LLM_MAX_BACKOFF" ENV_CONSOLIDATION_LLM_TIMEOUT = "HINDSIGHT_API_CONSOLIDATION_LLM_TIMEOUT" ENV_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG = "HINDSIGHT_API_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG" +ENV_CONSOLIDATION_LLM_EXTRA_BODY = "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY" ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER" ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL" @@ -927,6 +930,7 @@ class HindsightConfig: retain_llm_max_backoff: float | None retain_llm_timeout: float | None retain_llm_litellmrouter_config: dict | None + retain_llm_extra_body: 
dict | None reflect_llm_provider: str | None reflect_llm_api_key: str | None @@ -938,6 +942,7 @@ class HindsightConfig: reflect_llm_max_backoff: float | None reflect_llm_timeout: float | None reflect_llm_litellmrouter_config: dict | None + reflect_llm_extra_body: dict | None consolidation_llm_provider: str | None consolidation_llm_api_key: str | None @@ -949,6 +954,7 @@ class HindsightConfig: consolidation_llm_max_backoff: float | None consolidation_llm_timeout: float | None consolidation_llm_litellmrouter_config: dict | None + consolidation_llm_extra_body: dict | None # Embeddings embeddings_provider: str @@ -1469,6 +1475,9 @@ def from_env(cls) -> "HindsightConfig": else None, retain_llm_timeout=float(os.getenv(ENV_RETAIN_LLM_TIMEOUT)) if os.getenv(ENV_RETAIN_LLM_TIMEOUT) else None, retain_llm_litellmrouter_config=_parse_llm_router_config(ENV_RETAIN_LLM_LITELLMROUTER_CONFIG), + retain_llm_extra_body=json.loads(os.getenv(ENV_RETAIN_LLM_EXTRA_BODY)) + if os.getenv(ENV_RETAIN_LLM_EXTRA_BODY) + else None, reflect_llm_provider=os.getenv(ENV_REFLECT_LLM_PROVIDER) or None, reflect_llm_api_key=os.getenv(ENV_REFLECT_LLM_API_KEY) or None, reflect_llm_model=os.getenv(ENV_REFLECT_LLM_MODEL) @@ -1494,6 +1503,9 @@ def from_env(cls) -> "HindsightConfig": if os.getenv(ENV_REFLECT_LLM_TIMEOUT) else None, reflect_llm_litellmrouter_config=_parse_llm_router_config(ENV_REFLECT_LLM_LITELLMROUTER_CONFIG), + reflect_llm_extra_body=json.loads(os.getenv(ENV_REFLECT_LLM_EXTRA_BODY)) + if os.getenv(ENV_REFLECT_LLM_EXTRA_BODY) + else None, consolidation_llm_provider=os.getenv(ENV_CONSOLIDATION_LLM_PROVIDER) or None, consolidation_llm_api_key=os.getenv(ENV_CONSOLIDATION_LLM_API_KEY) or None, consolidation_llm_model=os.getenv(ENV_CONSOLIDATION_LLM_MODEL) @@ -1519,6 +1531,9 @@ def from_env(cls) -> "HindsightConfig": if os.getenv(ENV_CONSOLIDATION_LLM_TIMEOUT) else None, consolidation_llm_litellmrouter_config=_parse_llm_router_config(ENV_CONSOLIDATION_LLM_LITELLMROUTER_CONFIG), + 
consolidation_llm_extra_body=json.loads(os.getenv(ENV_CONSOLIDATION_LLM_EXTRA_BODY)) + if os.getenv(ENV_CONSOLIDATION_LLM_EXTRA_BODY) + else None, # Embeddings embeddings_provider=os.getenv(ENV_EMBEDDINGS_PROVIDER, DEFAULT_EMBEDDINGS_PROVIDER), embeddings_local_model=os.getenv(ENV_EMBEDDINGS_LOCAL_MODEL, DEFAULT_EMBEDDINGS_LOCAL_MODEL), diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py index 4f633e91b..65052fe44 100644 --- a/hindsight-api-slim/hindsight_api/engine/memory_engine.py +++ b/hindsight-api-slim/hindsight_api/engine/memory_engine.py @@ -573,7 +573,7 @@ def __init__( api_key=retain_api_key, base_url=retain_base_url, model=retain_model, - extra_body=config.llm_extra_body, + extra_body=config.retain_llm_extra_body or config.llm_extra_body, default_headers=config.llm_default_headers, litellmrouter_config=config.retain_llm_litellmrouter_config or config.llm_litellmrouter_config, ) @@ -597,7 +597,7 @@ def __init__( api_key=reflect_api_key, base_url=reflect_base_url, model=reflect_model, - extra_body=config.llm_extra_body, + extra_body=config.reflect_llm_extra_body or config.llm_extra_body, default_headers=config.llm_default_headers, litellmrouter_config=config.reflect_llm_litellmrouter_config or config.llm_litellmrouter_config, ) @@ -621,7 +621,7 @@ def __init__( api_key=consolidation_api_key, base_url=consolidation_base_url, model=consolidation_model, - extra_body=config.llm_extra_body, + extra_body=config.consolidation_llm_extra_body or config.llm_extra_body, default_headers=config.llm_default_headers, litellmrouter_config=config.consolidation_llm_litellmrouter_config or config.llm_litellmrouter_config, ) diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py index 717f1ed42..5898b8bf2 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py +++ 
b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py @@ -10,6 +10,7 @@ import asyncio import json import logging +import os import re import time from typing import TYPE_CHECKING, Any, Awaitable, Callable @@ -562,7 +563,22 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): if include_recall: forced_sequence.append("recall") - if iteration < len(forced_sequence): + # Escape hatch for inference engines whose tool-call parser breaks + # when tool_choice forces a specific function name. vLLM (as of v0.20.2 + # / nightly 2026-05-13) has open issue #35936 + #33965: forced + # tool_choice ("required" and named-function modes) bypasses the + # configured --tool-call-parser and uses JSON-only validation. For + # models that emit XML-style tool calls (Qwen3 family with qwen3_coder + # parser), this silently returns tool_calls=[] while finish_reason + # still reports "tool_calls". Set + # HINDSIGHT_API_REFLECT_DISABLE_FORCED_TOOL_CHOICE=true to fall back + # to tool_choice="auto" for every iteration; the model still calls + # retrieval tools when given factual queries, just without API-level + # forcing. + disable_forced = os.getenv( + "HINDSIGHT_API_REFLECT_DISABLE_FORCED_TOOL_CHOICE", "" + ).lower() in ("true", "1", "yes") + if not disable_forced and iteration < len(forced_sequence): iter_tool_choice: str | dict = {"type": "function", "function": {"name": forced_sequence[iteration]}} else: iter_tool_choice = "auto" diff --git a/hindsight-api-slim/tests/test_per_scope_llm_extra_body.py b/hindsight-api-slim/tests/test_per_scope_llm_extra_body.py new file mode 100644 index 000000000..021c51c50 --- /dev/null +++ b/hindsight-api-slim/tests/test_per_scope_llm_extra_body.py @@ -0,0 +1,104 @@ +"""Tests for per-scope ``HINDSIGHT_API_<SCOPE>_LLM_EXTRA_BODY`` env vars. + +Verifies: +- Each scope's extra_body env var parses as JSON into the Config dataclass. +- Per-scope value beats the global ``HINDSIGHT_API_LLM_EXTRA_BODY`` when set. 
+- Unset per-scope value falls back to the global. +- Unset global + unset per-scope = None (no extra_body merged at call time). +""" + +import json +import os + +import pytest + + +@pytest.fixture +def reset_env_extra_body(): + """Snapshot + restore all extra_body env vars around each test.""" + from hindsight_api.config import clear_config_cache + + keys = [ + "HINDSIGHT_API_LLM_EXTRA_BODY", + "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY", + "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY", + "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY", + # required to make get_config() succeed without a real LLM provider + "HINDSIGHT_API_SKIP_LLM_VERIFICATION", + "HINDSIGHT_API_LAZY_RERANKER", + "HINDSIGHT_API_LLM_PROVIDER", + "HINDSIGHT_API_LLM_MODEL", + ] + saved = {k: os.environ.get(k) for k in keys} + os.environ["HINDSIGHT_API_SKIP_LLM_VERIFICATION"] = "true" + os.environ["HINDSIGHT_API_LAZY_RERANKER"] = "true" + os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "mock" + os.environ["HINDSIGHT_API_LLM_MODEL"] = "default-model" + # Per-scope vars start unset; tests opt-in by setting them explicitly. 
+ for k in [ + "HINDSIGHT_API_LLM_EXTRA_BODY", + "HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY", + "HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY", + "HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY", + ]: + os.environ.pop(k, None) + clear_config_cache() + yield + for k, v in saved.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + clear_config_cache() + + +class TestPerScopeLLMExtraBody: + def test_unset_yields_none_on_all_scopes(self, reset_env_extra_body): + from hindsight_api.config import get_config + + config = get_config() + assert config.llm_extra_body is None + assert config.retain_llm_extra_body is None + assert config.reflect_llm_extra_body is None + assert config.consolidation_llm_extra_body is None + + def test_per_scope_env_parses_as_json(self, reset_env_extra_body): + from hindsight_api.config import clear_config_cache, get_config + + os.environ["HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY"] = json.dumps( + {"temperature": 0.6, "top_p": 0.8} + ) + os.environ["HINDSIGHT_API_REFLECT_LLM_EXTRA_BODY"] = json.dumps( + {"chat_template_kwargs": {"enable_thinking": True}} + ) + os.environ["HINDSIGHT_API_CONSOLIDATION_LLM_EXTRA_BODY"] = json.dumps( + {"chat_template_kwargs": {"enable_thinking": False}, "presence_penalty": 0.0} + ) + clear_config_cache() + config = get_config() + assert config.retain_llm_extra_body == {"temperature": 0.6, "top_p": 0.8} + assert config.reflect_llm_extra_body == {"chat_template_kwargs": {"enable_thinking": True}} + assert config.consolidation_llm_extra_body == { + "chat_template_kwargs": {"enable_thinking": False}, + "presence_penalty": 0.0, + } + + def test_global_only_leaves_per_scope_none(self, reset_env_extra_body): + from hindsight_api.config import clear_config_cache, get_config + + os.environ["HINDSIGHT_API_LLM_EXTRA_BODY"] = json.dumps({"temperature": 0.7}) + clear_config_cache() + config = get_config() + assert config.llm_extra_body == {"temperature": 0.7} + # Per-scope fields stay None — fallback happens at engine init, 
not parse time. + assert config.retain_llm_extra_body is None + assert config.reflect_llm_extra_body is None + assert config.consolidation_llm_extra_body is None + + def test_invalid_json_raises_at_parse_time(self, reset_env_extra_body): + from hindsight_api.config import clear_config_cache, get_config + + os.environ["HINDSIGHT_API_RETAIN_LLM_EXTRA_BODY"] = "{not valid json" + clear_config_cache() + with pytest.raises(json.JSONDecodeError): + get_config()