diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 35b060674bbcb..47784e07be338 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2188,6 +2188,24 @@ struct server_context {

         metrics.init();

+        // thinking is enabled if:
+        // 1. It's not explicitly disabled (reasoning_budget == 0)
+        // 2. The chat template supports it
+        bool enable_thinking = params_base.reasoning_budget != 0;
+        if (enable_thinking) {
+            common_chat_templates_inputs dummy_inputs;
+            common_chat_msg msg;
+            msg.role = "user";
+            msg.content = "test";
+            dummy_inputs.messages = {msg};
+            dummy_inputs.enable_thinking = false;
+            const auto rendered_no_thinking = common_chat_templates_apply(chat_templates.get(), dummy_inputs);
+            dummy_inputs.enable_thinking = true;
+            const auto rendered_with_thinking = common_chat_templates_apply(chat_templates.get(), dummy_inputs);
+            enable_thinking = rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+        }
+        SRV_INF("Enable thinking? %d\n", enable_thinking);
+
         oai_parser_opt = {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,
@@ -2196,7 +2214,7 @@ struct server_context {
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking       */ params_base.reasoning_budget != 0,
+            /* enable_thinking       */ enable_thinking,
         };
     }
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f3dfc8225da4d..8bd0f8d097caa 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     if (body.contains(key) && !body.at(key).is_null()) {
         try {
             return body.at(key);
-        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
             return default_value;
         }
     } else {
@@ -766,6 +766,16 @@ static json oaicompat_chat_params_parse(
         inputs.chat_template_kwargs[item.key()] = item.value().dump();
     }

+    // parse the "enable_thinking" kwarg to override the default value
+    auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
+    if (enable_thinking_kwarg == "true") {
+        inputs.enable_thinking = true;
+    } else if (enable_thinking_kwarg == "false") {
+        inputs.enable_thinking = false;
+    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
+        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -782,7 +792,7 @@ static json oaicompat_chat_params_parse(
             /* TODO: test this properly */
             inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;

-            if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+            if ( inputs.enable_thinking ) {
                 throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
             }