server : fix handling of the ignore_eos flag (ggml-org#14710)

ggerganov · web-flow · commit 538cc77f7f44 · 2025-07-16T12:13:57.000+03:00
ggml-ci
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -127,7 +127,6 @@ struct slot_params {
     std::vector<std::string> response_fields;
     bool timings_per_token = false;
     bool post_sampling_probs = false;
-    bool ignore_eos = false;
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -441,7 +440,6 @@ struct server_task {
 
         {
             params.sampling.logit_bias.clear();
-            params.ignore_eos = json_value(data, "ignore_eos", false);
 
             const auto & logit_bias = data.find("logit_bias");
             if (logit_bias != data.end() && logit_bias->is_array()) {
@@ -472,6 +470,16 @@ struct server_task {
                     }
                 }
             }
+
+            params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
+            if (params.sampling.ignore_eos) {
+                for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+                    if (llama_vocab_is_eog(vocab, i)) {
+                        //SRV_DBG("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(ctx, i).c_str(), -INFINITY);
+                        params.sampling.logit_bias.push_back({i, -INFINITY});
+                    }
+                }
+            }
         }
 
         {
@@ -2217,10 +2225,6 @@ struct server_context {
             slot.params.n_predict = slot.n_predict;
         }
 
-        if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
-        }
-
         {
             if (slot.smpl != nullptr) {
                 common_sampler_free(slot.smpl);

Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,6 @@ struct slot_params {`
`127`	`127`	`std::vector<std::string> response_fields;`
`128`	`128`	`bool timings_per_token = false;`
`129`	`129`	`bool post_sampling_probs = false;`
`130`		`- bool ignore_eos = false;`
`131`	`130`
`132`	`131`	`struct common_params_sampling sampling;`
`133`	`132`	`struct common_params_speculative speculative;`
`@@ -441,7 +440,6 @@ struct server_task {`
`441`	`440`
`442`	`441`	`{`
`443`	`442`	`params.sampling.logit_bias.clear();`
`444`		`- params.ignore_eos = json_value(data, "ignore_eos", false);`
`445`	`443`
`446`	`444`	`const auto & logit_bias = data.find("logit_bias");`
`447`	`445`	`if (logit_bias != data.end() && logit_bias->is_array()) {`
`@@ -472,6 +470,16 @@ struct server_task {`
`472`	`470`	`}`
`473`	`471`	`}`
`474`	`472`	`}`
	`473`	`+`
	`474`	`+ params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);`
	`475`	`+ if (params.sampling.ignore_eos) {`
	`476`	`+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {`
	`477`	`+ if (llama_vocab_is_eog(vocab, i)) {`
	`478`	`+ //SRV_DBG("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(ctx, i).c_str(), -INFINITY);`
	`479`	`+ params.sampling.logit_bias.push_back({i, -INFINITY});`
	`480`	`+ }`
	`481`	`+ }`
	`482`	`+ }`
`475`	`483`	`}`
`476`	`484`
`477`	`485`	`{`
`@@ -2217,10 +2225,6 @@ struct server_context {`
`2217`	`2225`	`slot.params.n_predict = slot.n_predict;`
`2218`	`2226`	`}`
`2219`	`2227`
`2220`		`- if (slot.params.ignore_eos && has_eos_token) {`
`2221`		`- slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});`
`2222`		`- }`
`2223`		`-`
`2224`	`2228`	`{`
`2225`	`2229`	`if (slot.smpl != nullptr) {`
`2226`	`2230`	`common_sampler_free(slot.smpl);`