Skip to content

Commit 7a3f5e6

Browse files
gabe-l-hart and njsyw1997
authored and committed
Thinking model disabled assistant prefill (ggml-org#15404)
* feat: Set enable_thinking IFF not disabled and supported Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * fix: Fix inverted logic condition for prefill error Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * fix: Always parse the enable_thinking kwarg to overwrite the default value From what I can tell, this started as a Qwen3-specific keyword, but since the use in `chat.cpp` translates inputs.enable_thinking to the right thinking kwarg for the given model, this is now more of a standardized kwarg, so it should always override the default value when sent as part of the chat_template_kwargs field in the API. Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * fix: Don't limit template expansion check to jinja With the use_jinja check, non-jinja models would enable thinking and always fail assistant prefill Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * feat: Add the error text to json type errors in json_value Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * feat: Explicitly reject string values for "enable_thinking" There are too many possible "truthy" / "falsy" strings and too many ambiguous strings that don't have a clear truthy/falsy value, so the simplest thing to do here is to reject the request. Ideally, this would be a 422 (Unprocessable Entity), but right now it's coming back as a 500. 
Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * refactor: Move logic for detecting template enable_thinking support to common Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> * fix: Use raw pointer for common chat template function Branch: gabe-l-hart/thinking-model-disabled-agent-prefill Signed-off-by: Gabe Goodhart <[email protected]> --------- Signed-off-by: Gabe Goodhart <[email protected]>
1 parent f12109b commit 7a3f5e6

File tree

4 files changed

+35
-4
lines changed

4 files changed

+35
-4
lines changed

common/chat.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
163163
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
164164
}
165165

166+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
167+
common_chat_templates_inputs dummy_inputs;
168+
common_chat_msg msg;
169+
msg.role = "user";
170+
msg.content = "test";
171+
dummy_inputs.messages = {msg};
172+
dummy_inputs.enable_thinking = false;
173+
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
174+
dummy_inputs.enable_thinking = true;
175+
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
176+
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
177+
}
178+
166179
template <>
167180
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
168181
std::vector<common_chat_msg> msgs;

common/chat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
199199

200200
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
201201

202+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
203+
202204
// Parses a JSON array of messages in OpenAI's chat completion API format.
203205
// T can be std::string containing JSON or nlohmann::ordered_json
204206
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);

tools/server/server.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2267,6 +2267,12 @@ struct server_context {
22672267

22682268
metrics.init();
22692269

2270+
// thinking is enabled if:
2271+
// 1. It's not explicitly disabled (reasoning_budget == 0)
2272+
// 2. The chat template supports it
2273+
const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
2274+
SRV_INF("Enable thinking? %d\n", enable_thinking);
2275+
22702276
oai_parser_opt = {
22712277
/* use_jinja */ params_base.use_jinja,
22722278
/* prefill_assistant */ params_base.prefill_assistant,
@@ -2275,7 +2281,7 @@ struct server_context {
22752281
/* common_chat_templates */ chat_templates.get(),
22762282
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
22772283
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
2278-
/* enable_thinking */ params_base.reasoning_budget != 0,
2284+
/* enable_thinking */ enable_thinking,
22792285
};
22802286
}
22812287

tools/server/utils.hpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
5454
if (body.contains(key) && !body.at(key).is_null()) {
5555
try {
5656
return body.at(key);
57-
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
58-
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
57+
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
58+
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
5959
return default_value;
6060
}
6161
} else {
@@ -708,6 +708,16 @@ static json oaicompat_chat_params_parse(
708708
inputs.chat_template_kwargs[item.key()] = item.value().dump();
709709
}
710710

711+
// parse the "enable_thinking" kwarg to override the default value
712+
auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
713+
if (enable_thinking_kwarg == "true") {
714+
inputs.enable_thinking = true;
715+
} else if (enable_thinking_kwarg == "false") {
716+
inputs.enable_thinking = false;
717+
} else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
718+
throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
719+
}
720+
711721
// if the assistant message appears at the end of list, we do not add end-of-turn token
712722
// for ex. this can be useful to modify the reasoning process in reasoning models
713723
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -724,7 +734,7 @@ static json oaicompat_chat_params_parse(
724734
/* TODO: test this properly */
725735
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
726736

727-
if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
737+
if ( inputs.enable_thinking ) {
728738
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
729739
}
730740

0 commit comments

Comments
 (0)