22 commits
8fac4b1  feat: add EAGLE3 speculative decoding support (ruixiang63, Dec 14, 2025)
ac5667d  fix eagle3 logits sync bug & remove ggml_set_sync() (ruixiang63, Dec 16, 2025)
3e7f376  Merge branch 'master' into pr/18039 (ggerganov, Dec 17, 2025)
5a79c19  eagle3 : improve naming (ggerganov, Dec 17, 2025)
c0d99e6  add eagle3 support for Qwen3 series models (ruixiang63, Jan 8, 2026)
71ba283  add eagle3 support for Qwen3 MoE models (ruixiang63, Jan 9, 2026)
3da288d  eagle3: load lm_head from target model if not in draft model when con… (ruixiang63, Jan 10, 2026)
13a9f31  eagle3: make d2t mapping optional (ruixiang63, Jan 10, 2026)
75883cd  eagle3: add support for gpt-oss-120B eagle3 (ruixiang63, Jan 10, 2026)
7b78bfa  eagle3: add support for RedHtAI eagle3 speculator series models (ruixiang63, Jan 16, 2026)
7d4c223  Merge branch 'master' into HEAD (ggerganov, Feb 5, 2026)
5e224bc  Merge branch 'master' into pr/18039 (ggerganov, Feb 9, 2026)
b353792  eagle3: fix model convert issue (ruixiang63, Feb 20, 2026)
9fea243  eagle3: fix model convert code format (ruixiang63, Feb 20, 2026)
b8ab2cc  Merge branch 'master' into pr/18039 (ggerganov, Feb 23, 2026)
07e2c97  eagle3: support --eagle3 in llama-cli (ruixiang63, Feb 28, 2026)
5bb2d50  Merge branch 'master' into pr/18039 (ggerganov, Mar 16, 2026)
91b03e4  Merge branch 'master' into pr/18039 (ggerganov, Apr 24, 2026)
0724d66  dflash: first working POC (ruixiang63, Apr 18, 2026)
85a0089  dflash: add support for qwen3.5/3.6 moe models (ruixiang63, Apr 19, 2026)
e344c4a  dflash: remove rebundant logic & correct bias naming (ruixiang63, Apr 24, 2026)
67cb0d5  dflash: enable llama-cli & llama-server with np=1 (ruixiang63, Apr 27, 2026)
18 changes: 16 additions & 2 deletions common/arg.cpp
@@ -3087,7 +3087,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, bool value) {
             params.use_jinja = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -3143,7 +3143,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -3467,6 +3467,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.p_min = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    add_opt(common_arg(
+        {"--eagle3"},
+        "use EAGLE3 speculative decoding with the draft model",
+        [](common_params & params) {
+            params.speculative.eagle3 = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_CLI}));
+    add_opt(common_arg(
+        {"--dflash"},
+        "use DFlash speculative decoding with the draft model",
+        [](common_params & params) {
+            params.speculative.dflash = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
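For reference, a minimal sketch of how the two new flags reach user code through the existing parser. This is not part of the diff; it assumes the common_params_parse() entry point from common/arg.h keeps its current signature.

// Hedged sketch, not part of this PR: parse CLI args and read the new toggles.
#include "arg.h"
#include "common.h"

#include <cstdio>

int main(int argc, char ** argv) {
    common_params params;

    // e.g.: llama-speculative -m target.gguf -md draft.gguf --eagle3
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
    }

    // the handlers above only toggle booleans; the decoding mode is picked up later
    printf("eagle3: %d, dflash: %d\n", params.speculative.eagle3, params.speculative.dflash);
    return 0;
}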
5 changes: 5 additions & 0 deletions common/common.h
@@ -159,6 +159,7 @@ enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
     COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
     COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
+    COMMON_SPECULATIVE_TYPE_DFLASH,        // dflash draft model
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
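Because the enum gains a member, exhaustive switches over common_speculative_type need a new case. A minimal illustration with a hypothetical helper (not part of this PR; only the enum values visible in the hunk above are assumed):

// Hypothetical helper, for illustration only: switches over the enum
// must now handle the DFLASH case as well.
static const char * speculative_type_name(common_speculative_type type) {
    switch (type) {
        case COMMON_SPECULATIVE_TYPE_NONE:          return "none";
        case COMMON_SPECULATIVE_TYPE_DRAFT:         return "draft";
        case COMMON_SPECULATIVE_TYPE_EAGLE3:        return "eagle3";
        case COMMON_SPECULATIVE_TYPE_DFLASH:        return "dflash"; // new in this PR
        case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram-simple";
        case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram-map-k";
        case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
        default:                                    return "unknown"; // values not shown in this hunk
    }
}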
@@ -322,10 +323,14 @@ struct common_params_speculative {
 
     struct common_params_model mparams_dft;
 
+    llama_model * model_tgt = nullptr; // the target model
     llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
 
     llama_context_params cparams_dft; // these are the parameters for the draft llama_context
 
+    bool eagle3 = false; // use EAGLE3 speculative decoding
+    bool dflash = false; // use DFlash speculative decoding
+
     int32_t n_ctx = 0; // draft context size
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 
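For embedders that fill common_params_speculative directly instead of going through CLI flags, a hedged sketch of the extended struct in use. Field names come from the diff above; the helper itself is illustrative.

// Illustrative only: configure the extended speculative params by hand.
#include "common.h"

static common_params_speculative make_spec_params(llama_model * tgt, llama_model * dft) {
    common_params_speculative spec;

    spec.model_tgt = tgt;    // new field: the target model
    spec.model_dft = dft;    // the draft model, shareable across speculative contexts

    spec.eagle3 = true;      // new field: same effect as passing --eagle3
    // spec.dflash = true;   // new field: same effect as passing --dflash

    spec.n_ctx        = 0;   // draft context size (0 = loaded from model)
    spec.n_gpu_layers = -1;  // -1 = default VRAM offload for the draft model

    return spec;
}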