From f6648cfd0c0f69ec7cd9945475c4d8c80e9219a1 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Mon, 18 Aug 2025 12:41:48 -0600
Subject: [PATCH 1/6] feat: Set enable_thinking IFF not disabled and supported

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/server.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 35b060674bbcb..87cef76de942f 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2188,6 +2188,24 @@ struct server_context {
 
         metrics.init();
 
+        // thinking is enabled if:
+        // 1. It's not explicitly disabled (reasoning_budget == 0)
+        // 2. The chat template supports it
+        bool enable_thinking = params_base.reasoning_budget != 0;
+        if (enable_thinking && params_base.use_jinja) {
+            common_chat_templates_inputs dummy_inputs;
+            common_chat_msg msg;
+            msg.role = "user";
+            msg.content = "test";
+            dummy_inputs.messages = {msg};
+            dummy_inputs.enable_thinking = false;
+            const auto rendered_no_thinking = common_chat_templates_apply(chat_templates.get(), dummy_inputs);
+            dummy_inputs.enable_thinking = true;
+            const auto rendered_with_thinking = common_chat_templates_apply(chat_templates.get(), dummy_inputs);
+            enable_thinking = rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+        }
+        SRV_INF("Enable thinking? %d\n", enable_thinking);
+
         oai_parser_opt = {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,

From 8e4c1c123b51a956cb2adc4347d556cfb5a540fb Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Mon, 18 Aug 2025 12:42:10 -0600
Subject: [PATCH 2/6] fix: Fix inverted logic condition for prefill error

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/server.cpp | 2 +-
 tools/server/utils.hpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 87cef76de942f..bb1e3775d9fb2 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2214,7 +2214,7 @@ struct server_context {
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking       */ params_base.reasoning_budget != 0,
+            /* enable_thinking       */ enable_thinking,
         };
     }
 
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f3dfc8225da4d..20fb52350c8e5 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -782,7 +782,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if ( inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
 

From c31c9bcf850a10eaa40398d4f41961091f35f6ba Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Mon, 18 Aug 2025 13:22:00 -0600
Subject: [PATCH 3/6] fix: Always parse the enable_thinking kwarg to overwrite
 the default value

From what I can tell, this started as a Qwen3-specific keyword, but since
`chat.cpp` translates inputs.enable_thinking into the right thinking kwarg
for the given model, it is now more of a standardized kwarg, so it should
always override the default value when sent as part of the
chat_template_kwargs field in the API.

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/utils.hpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 20fb52350c8e5..0c7e0f05c21a0 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -766,6 +766,14 @@ static json oaicompat_chat_params_parse(
         inputs.chat_template_kwargs[item.key()] = item.value().dump();
     }
 
+    // parse the "enable_thinking" kwarg to override the default value
+    auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
+    if (enable_thinking_kwarg == "true") {
+        inputs.enable_thinking = true;
+    } else if (enable_thinking_kwarg == "false") {
+        inputs.enable_thinking = false;
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -782,7 +790,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if ( inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if ( inputs.enable_thinking ) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
 

From 7cf596359271efa19a131bd90c42ca0e5b6cefd1 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Mon, 18 Aug 2025 14:02:16 -0600
Subject: [PATCH 4/6] fix: Don't limit template expansion check to jinja

With the use_jinja check, non-jinja models would enable thinking and always
fail assistant prefill

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index bb1e3775d9fb2..47784e07be338 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2192,7 +2192,7 @@ struct server_context {
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
         bool enable_thinking = params_base.reasoning_budget != 0;
-        if (enable_thinking && params_base.use_jinja) {
+        if (enable_thinking) {
             common_chat_templates_inputs dummy_inputs;
             common_chat_msg msg;
             msg.role = "user";

From 4c06dcacc7d4fa1cbb1318a743e7d43ab92b767c Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 20 Aug 2025 10:28:01 -0600
Subject: [PATCH 5/6] feat: Add the error text to json type errors in
 json_value

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 0c7e0f05c21a0..6e5a39788cb8f 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     if (body.contains(key) && !body.at(key).is_null()) {
         try {
             return body.at(key);
-        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
             return default_value;
         }
     } else {

From 6adae511a5823815237f259a575887ec1827eca3 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Wed, 20 Aug 2025 10:34:06 -0600
Subject: [PATCH 6/6] feat: Explicitly reject string values for
 "enable_thinking"

There are too many possible "truthy" / "falsy" strings and too many
ambiguous strings that don't have a clear truthy/falsy value, so the
simplest thing to do here is to reject the request. Ideally, this would be
a 422 (Unprocessable Entity), but right now it's coming back as a 500.

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 tools/server/utils.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 6e5a39788cb8f..8bd0f8d097caa 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -772,6 +772,8 @@ static json oaicompat_chat_params_parse(
         inputs.enable_thinking = true;
     } else if (enable_thinking_kwarg == "false") {
         inputs.enable_thinking = false;
+    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
+        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
     }
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token