diff --git a/common/arg.cpp b/common/arg.cpp index 40af7e574830f..d4a3f5b7f21ec 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.swa_full = true; } ).set_env("LLAMA_ARG_SWA_FULL")); + add_opt(common_arg( + {"--graph-reuse", "-gr"}, + string_format("reuse previous compute graphs when possible (default: %s)" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"), + [](common_params & params) { + params.graph_reuse = true; + } + ).set_env("LLAMA_ARG_GRAPH_REUSE")); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), diff --git a/common/common.cpp b/common/common.cpp index e4e71ad13fb59..c7c163404a2af 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.no_perf = params.no_perf; cparams.op_offload = !params.no_op_offload; cparams.swa_full = params.swa_full; + cparams.graph_reuse = params.graph_reuse; cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; diff --git a/common/common.h b/common/common.h index 8922090e7b10d..465d05672d2a6 100644 --- a/common/common.h +++ b/common/common.h @@ -330,6 +330,7 @@ struct common_params { bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + bool graph_reuse = false; // reuse previous compute graphs when possible bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // use mmap for faster loads diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 40fc315e82fd1..de3dcb60754f3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -821,13 +821,23 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { // the callback given to the thread pool void (^encode_async)(size_t ith); + void (^encode_next)(void); // n_cb command buffers + 1 used by the main thread struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs_next[2]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; + + // reuse info + int i_next; + + int n_nodes_max; + int n_nodes_prev; + + struct ggml_tensor * cg_nodes; }; // MSL code @@ -1084,6 +1094,7 @@ @implementation GGMLMetalClass ctx->gf = nil; ctx->encode_async = nil; + ctx->encode_next = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; @@ -1091,6 +1102,13 @@ @implementation GGMLMetalClass ctx->cmd_bufs[i].mem_pool->device = device; } + for (int i = 0; i < 2; ++i) { + ctx->cmd_bufs_next[i].obj = nil; + + ctx->cmd_bufs_next[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs_next[i].mem_pool->device = device; + } + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); @@ -1521,6 +1539,13 @@ @implementation GGMLMetalClass 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); } + ctx->i_next = 0; + + ctx->n_nodes_max = 16384; + ctx->n_nodes_prev = -1; + + ctx->cg_nodes = ggml_aligned_malloc(ctx->n_nodes_max * sizeof(struct ggml_tensor)); + return ctx; } @@ -1532,6 +1557,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { } Block_release(ctx->encode_async); + Block_release(ctx->encode_next); [ctx->queue release]; @@ -1541,8 +1567,13 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + ggml_metal_mem_pool_free(ctx->cmd_bufs_next[0].mem_pool); + ggml_metal_mem_pool_free(ctx->cmd_bufs_next[1].mem_pool); + dispatch_release(ctx->d_queue); + ggml_aligned_free(ctx->cg_nodes, ctx->n_nodes_max * sizeof(struct ggml_tensor)); + free(ctx); } @@ -5448,6 +5479,39 @@ static enum ggml_status ggml_metal_graph_compute( struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + //const int64_t t_start = ggml_time_us(); + + ///////////////////////////////////////////////////// + // hacky way to determine that the graph is the same as the previous one + // + bool can_reuse = true; + + if (gf->n_nodes > ctx->n_nodes_max) { + can_reuse = false; + } + + if (gf->n_nodes != ctx->n_nodes_prev) { + can_reuse = false; + } + + if (can_reuse) { + for (int i = 0; i < gf->n_nodes; ++i) { + if (memcmp(gf->nodes[i], ctx->cg_nodes + i, sizeof(struct ggml_tensor)) != 0) { + can_reuse = false; + break; + } + } + } + + if (!can_reuse) { + ctx->n_nodes_prev = gf->n_nodes; + + for (int i = 0; i < gf->n_nodes; ++i) { + memcpy(ctx->cg_nodes + i, gf->nodes[i], sizeof(struct ggml_tensor)); + } + } + ////////////////////////////////////////////////////// + // number of nodes encoded by the main thread (empirically determined) const int n_main = 128; @@ -5492,78 +5556,126 @@ static enum ggml_status ggml_metal_graph_compute( } } - // the main thread commits the first few commands immediately - // cmd_buf[n_cb] - { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->cmd_bufs[n_cb].obj = cmd_buf; - - [cmd_buf enqueue]; - ctx->encode_async(n_cb); - } - - // prepare the rest of the command buffers asynchronously - // cmd_buf[0.. n_cb) - for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->cmd_bufs[cb_idx].obj = cmd_buf; + if (!can_reuse) { + // the main thread commits the first few commands immediately + // cmd_buf[n_cb] + { + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - // always enqueue the first two command buffers - // enqueue all of the command buffers if we don't need to abort - if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + ctx->encode_async(n_cb); } - } - - dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - // wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) - { - id cmd_buf = ctx->cmd_bufs[n_cb].obj; - [cmd_buf waitUntilCompleted]; + // prepare the rest of the command buffers asynchronously + // cmd_buf[0.. 
n_cb) + for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [cmd_buf enqueue]; } - - return GGML_STATUS_FAILED; } - } - for (int i = 0; i < n_cb; ++i) { - id cmd_buf = ctx->cmd_bufs[i].obj; - [cmd_buf waitUntilCompleted]; + dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + // encode the command buffer for the next iter while the GPU has already started + { + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + [cmd_buf retain]; + if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) { + [ctx->cmd_bufs_next[ctx->i_next].obj release]; } + ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf; - return GGML_STATUS_FAILED; + ctx->encode_next(); } - id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); - if (!next_buffer) { - continue; + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } } - const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; + for (int i = 0; i < n_cb; ++i) { + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + + id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); + if (!next_buffer) { + continue; + } + + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } + } else { + struct ggml_metal_command_buffer cmd_buf_cur = ctx->cmd_bufs_next[(ctx->i_next + 1)%2]; + + // directly submit the command buffer that we have prepared in the previous iteration + [ctx->cmd_bufs_next[(ctx->i_next + 1)%2].obj commit]; - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; + // encode the command buffer for the next iter + { + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + [cmd_buf retain]; + if (ctx->cmd_bufs_next[ctx->i_next].obj != nil) { + [ctx->cmd_bufs_next[ctx->i_next].obj release]; + } + ctx->cmd_bufs_next[ctx->i_next].obj = cmd_buf; + + ctx->encode_next(); } - [next_buffer commit]; + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = cmd_buf_cur.obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, ctx->i_next, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + } } if (!should_capture && ctx->capture_started) { @@ -5572,6 +5684,8 @@ static enum ggml_status ggml_metal_graph_compute( } } + //printf(" time = %.3f ms\n", (float)(ggml_time_us() - t_start)/1000.0f); + return GGML_STATUS_SUCCESS; } @@ -5919,6 +6033,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { Block_release(ctx->encode_async); } + if (ctx->encode_next) { + Block_release(ctx->encode_next); + } + ctx->encode_async = Block_copy(^(size_t iter) { const int cb_idx = iter; const int n_cb_l = ctx->n_cb; @@ -5967,6 +6085,40 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { [cmd_buf commit]; } }); + + ctx->encode_next = Block_copy(^(void) { + id cmd_buf = ctx->cmd_bufs_next[ctx->i_next].obj; + + id encoder = [cmd_buf computeCommandEncoder]; + + int node_start = 0; + int node_end = ctx->gf->n_nodes; + + const bool should_capture = ctx->capture_next_compute; + + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs_next[ctx->i_next].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); + + if (should_capture) { + [encoder popDebugGroup]; + } + + if (!res) { + break; + } + } + + [encoder endEncoding]; + + ctx->i_next = (ctx->i_next + 1) % 2; + }); } static struct ggml_backend_i ggml_backend_metal_i = { diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..1e8228997f1d8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -374,6 +374,8 @@ extern "C" { bool swa_full; // use full-size SWA cache 
(https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 + + bool graph_reuse; // reuse previous compute graphs when possible }; // model quantization parameters @@ -1429,6 +1431,7 @@ extern "C" { int32_t n_p_eval; int32_t n_eval; + int32_t n_reused; }; struct llama_perf_sampler_data { diff --git a/src/llama-batch.h b/src/llama-batch.h index 3420803ff9469..b27134c9331a9 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -34,6 +34,31 @@ struct llama_ubatch { llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx int8_t * output; // [n_tokens] | i | - + + bool is_same(const llama_ubatch & other) const { + bool res = + equal_seqs == other.equal_seqs && + n_tokens == other.n_tokens && + n_seq_tokens == other.n_seq_tokens && + n_seqs == other.n_seqs && + n_seqs_unq == other.n_seqs_unq && + ( + (!token && !other.token) || + (!embd && !other.embd) + ); + + if (!res) { + return false; + } + + // TODO: this won't work because seq_id_unq ptr can point to an old balloc that has + // been freed by this point. find a way to fix this + //for (uint32_t s = 0; s < n_seqs_unq; ++s) { + // res &= seq_id_unq[s] == other.seq_id_unq[s]; + //} + + return res; + } }; // a helper for sanitizing, fulfilling and splitting a batch diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..8a1e2a86f9709 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,7 +101,8 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.op_offload = params.op_offload; + cparams.op_offload = params.op_offload; + cparams.graph_reuse = params.graph_reuse; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -227,8 +228,8 @@ llama_context::llama_context( LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); - // buffer used to store the computation graph and the tensor meta data - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + gf_res_prev.reset(new llm_graph_result(max_nodes)); + gf_res_reserve.reset(new llm_graph_result(max_nodes)); // TODO: move these checks to ggml_backend_sched // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary @@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const { return sched.get(); } -ggml_context * llama_context::get_ctx_compute() const { - return ctx_compute.get(); -} - uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { +llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; return nullptr; } - auto * gf = graph_init(); - if (!gf) { - LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + auto * res = gf_res_prev.get(); 
+ auto * gf = res->get_gf(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); - ret = GGML_STATUS_FAILED; - return nullptr; - } + // the new graph parameters + // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters + const auto gparams = graph_params(res, ubatch, mctx, gtype); - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + const bool can_reuse = cparams.graph_reuse && res->update(gparams); + if (can_reuse) { + LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); + n_reused++; + } else { + res->reset(); - if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); - ret = GGML_STATUS_ALLOC_FAILED; - return nullptr; + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + //const auto t_start_us = ggml_time_us(); + + gf = model.build_graph(gparams); + + //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); + ret = GGML_STATUS_ALLOC_FAILED; + return nullptr; + } } res->set_inputs(&ubatch); - const auto status = graph_compute(gf, ubatch.n_tokens > 1); + const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); ret = status; @@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) { n_outputs = n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto causal_attn_org = cparams.causal_attn; // always use non-causal attention for encoder graphs @@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) { cparams.causal_attn = false; ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); cparams.causal_attn = causal_attn_org; @@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
- ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { @@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) { n_outputs = n_outputs_new; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + if (!cparams.graph_reuse) { + ggml_backend_sched_reset(sched.get()); + } return 0; } @@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { // graph // -int32_t llama_context::graph_max_nodes() const { - return std::max(65536, 5*model.n_tensors()); -} - -ggml_cgraph * llama_context::graph_init() { - ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_compute.reset(ggml_init(params)); - - return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); +uint32_t llama_context::graph_max_nodes() const { + return std::max(65536u, 5u*model.n_tensors()); } ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) { @@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); } + gf_res_prev->reset(); + ggml_backend_sched_reset(sched.get()); + // store the n_outputs as it is, and restore it afterwards // TODO: not sure if needed, might simplify in the future by removing this const auto save_n_outputs = this->n_outputs; @@ -1310,17 +1310,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx); + auto * res = gf_res_reserve.get(); - this->n_outputs = save_n_outputs; + const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__); - return nullptr; - } + res->reset(); - ggml_backend_sched_reset(sched.get()); + auto * gf = model.build_graph(gparams); + + this->n_outputs = save_n_outputs; // initialize scheduler with the specified graph if (!ggml_backend_sched_reserve(sched.get(), gf)) { @@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u return gf; } -llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx) { - return model.build_graph( - { - /*.ctx =*/ ctx, - /*.arch =*/ 
model.arch, - /*.hparams =*/ model.hparams, - /*.cparams =*/ cparams, - /*.ubatch =*/ ubatch, - /*.sched =*/ sched.get(), - /*.backend_cpu =*/ backend_cpu, - /*.cvec =*/ &cvec, - /*.loras =*/ &loras, - /*.mctx =*/ mctx, - /*.cross =*/ &cross, - /*.n_outputs =*/ n_outputs, - /*.cb =*/ graph_get_cb(), - }, gf, gtype); +llm_graph_params llama_context::graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const { + return { + /*.arch =*/ model.arch, + /*.hparams =*/ model.hparams, + /*.cparams =*/ cparams, + /*.ubatch =*/ ubatch, + /*.gtype =*/ gtype, + /*.sched =*/ sched.get(), + /*.backend_cpu =*/ backend_cpu, + /*.cvec =*/ &cvec, + /*.loras =*/ &loras, + /*.mctx =*/ mctx, + /*.cross =*/ &cross, + /*.n_outputs =*/ n_outputs, + /*.cb =*/ graph_get_cb(), + /*.res =*/ res, + }; } ggml_status llama_context::graph_compute( @@ -1930,6 +1927,7 @@ llama_perf_context_data llama_context::perf_get_data() const { data.t_eval_ms = 1e-3 * t_eval_us; data.n_p_eval = std::max(1, n_p_eval); data.n_eval = std::max(1, n_eval); + data.n_reused = std::max(0, n_reused); return data; } @@ -1938,6 +1936,7 @@ void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; + n_reused = 0; } // @@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter( break; } - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get()); + auto * res = gf_res_prev.get(); + + const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + + res->reset(); + + auto * gf = model.build_graph(gparams); struct ggml_context * ctx_compute_opt; { @@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() { /*.no_perf =*/ true, /*.op_offload =*/ true, /*.swa_full =*/ true, + /*.graph_reuse =*/ false, }; return result; @@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) { LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index 9ce05715a8c03..21d23d83851d1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -35,8 +35,6 @@ struct llama_context { ggml_backend_sched_t get_sched() const; - ggml_context * get_ctx_compute() const; - uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; uint32_t n_batch() const; @@ -96,7 +94,7 @@ struct llama_context { // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS - llm_graph_result_ptr process_ubatch( + llm_graph_result_i * process_ubatch( const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, @@ -188,10 +186,7 @@ struct llama_context { // public: - int32_t graph_max_nodes() const; - - // zero-out inputs and create the ctx_compute for the compute graph - ggml_cgraph * graph_init(); + uint32_t graph_max_nodes() const; // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status 
graph_compute(ggml_cgraph * gf, bool batched); @@ -200,12 +195,11 @@ struct llama_context { ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); private: - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype, - const llama_memory_context_i * mctx); + llm_graph_params graph_params( + llm_graph_result_i * res, + const llama_ubatch & ubatch, + const llama_memory_context_i * mctx, + llm_graph_type gtype) const; llm_graph_cb graph_get_cb() const; @@ -258,8 +252,6 @@ struct llama_context { ggml_backend_t backend_cpu = nullptr; std::vector backends; - ggml_context_ptr ctx_compute; - // training ggml_opt_context_t opt_ctx = nullptr; @@ -275,8 +267,8 @@ struct llama_context { std::vector backend_ptrs; std::vector backend_buft; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; + llm_graph_result_ptr gf_res_prev; + llm_graph_result_ptr gf_res_reserve; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; @@ -294,4 +286,6 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls + + mutable int32_t n_reused = 0; // number of times the previous graph was reused }; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index a3e7a37ee36d7..f81cd10823d3b 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -3,3 +3,32 @@ size_t llama_max_parallel_sequences(void) { return LLAMA_MAX_SEQ; } + +bool llama_cparams::is_same(const llama_cparams & other) const { + return + n_ctx == other.n_ctx && + n_batch == other.n_batch && + n_ubatch == other.n_ubatch && + n_seq_max == other.n_seq_max && + n_threads == other.n_threads && + n_threads_batch == other.n_threads_batch && + rope_freq_base == other.rope_freq_base && + rope_freq_scale == other.rope_freq_scale && + n_ctx_orig_yarn == other.n_ctx_orig_yarn && + yarn_ext_factor == other.yarn_ext_factor && + yarn_attn_factor == other.yarn_attn_factor && + yarn_beta_fast == other.yarn_beta_fast && + yarn_beta_slow == other.yarn_beta_slow && + defrag_thold == other.defrag_thold && + embeddings == other.embeddings && + causal_attn == other.causal_attn && + offload_kqv == other.offload_kqv && + flash_attn == other.flash_attn && + no_perf == other.no_perf && + warmup == other.warmup && + op_offload == other.op_offload && + graph_reuse == other.graph_reuse && + pooling_type == other.pooling_type && + cb_eval == other.cb_eval && + cb_eval_user_data == other.cb_eval_user_data; +} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 118615d5bd2d5..064767d51d92c 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -33,9 +33,12 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; + bool graph_reuse; enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + bool is_same(const llama_cparams & other) const; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7f0e8c67f1325..264491b4de83a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_embd::update(const llm_graph_params & params) { + bool res = true; + + res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!embd && 
!params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_pos::update(const llm_graph_params & params) { + bool res = true; + + res &= pos->ne[0] == params.ubatch.n_tokens; + + return res; +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { } } +bool llm_graph_input_out_ids::update(const llm_graph_params & params) { + bool res = true; + + res &= n_outputs == params.n_outputs; + + return res; +} + void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = ubatch->n_tokens; @@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } +bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv(); + res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv(); + res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); + + res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp + + return res; +} + void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(cross_kq_mask); @@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : n_ctx_orig (cparams.n_ctx_orig_yarn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (params.ctx), sched (params.sched), backend_cpu (params.backend_cpu), cvec (params.cvec), @@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : mctx (params.mctx), cross (params.cross), cb_func (params.cb), - res (std::make_unique()) { + res (static_cast(params.res)), + ctx0 
(res->get_ctx()) { } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { diff --git a/src/llama-graph.h b/src/llama-graph.h index 7bdf656768a0c..ae2fd6481b609 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1,6 +1,7 @@ #pragma once #include "llama-arch.h" +#include "llama-batch.h" #include "llama-hparams.h" #include "llama-adapter.h" @@ -14,7 +15,6 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; -struct llama_ubatch; struct llama_cparams; struct llama_memory_context_i; @@ -69,6 +69,8 @@ struct llama_cross { std::vector> seq_ids_enc; }; +struct llm_graph_params; + // // llm_graph_input // @@ -78,11 +80,19 @@ class llm_graph_input_i { virtual ~llm_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // return true if the resulting input tensors using the provided graph parameters would be + // the same as the previous input tensors that we have currently stored in the object + virtual bool update(const llm_graph_params & params) { + // returning false here by default will prevent from reusing the graph if the check + // for the input type has not been implemented yet + GGML_UNUSED(params); + return false; + } }; using llm_graph_input_ptr = std::unique_ptr; - class llm_graph_input_embd : public llm_graph_input_i { public: llm_graph_input_embd() = default; @@ -90,6 +100,8 @@ class llm_graph_input_embd : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] }; @@ -101,6 +113,8 @@ class llm_graph_input_pos : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * pos = nullptr; // I32 [n_batch] const uint32_t n_pos_per_embd = 1; @@ -154,17 +168,19 @@ class llm_graph_input_out_ids : public llm_graph_input_i { llm_graph_input_out_ids( const llama_hparams & hparams, const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} virtual ~llm_graph_input_out_ids() = default; void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * out_ids; // I32 [n_outputs] const llama_hparams & hparams; const llama_cparams & cparams; - const int32_t n_outputs; + const uint32_t n_outputs; }; class llm_graph_input_mean : public llm_graph_input_i { @@ -249,6 +265,8 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } @@ -280,6 +298,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { void set_input(const llama_ubatch * ubatch) override; + bool update(const llm_graph_params & params) override; + ggml_tensor * get_k_idxs() const { return self_k_idxs; } ggml_tensor * get_v_idxs() const { return self_v_idxs; } ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; } @@ -373,29 +393,110 @@ class llm_graph_input_one : public llm_graph_input_i { // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. 
// these are used by the llama_context to extact the relevant data, based on the compute parameters +// TODO: this interface seems redundant - remove it class llm_graph_result_i { public: virtual ~llm_graph_result_i() = default; - virtual ggml_tensor * get_tokens() = 0; - virtual ggml_tensor * get_logits() = 0; - virtual ggml_tensor * get_embd() = 0; - virtual ggml_tensor * get_embd_pooled() = 0; + virtual ggml_tensor * get_tokens() const = 0; + virtual ggml_tensor * get_logits() const = 0; + virtual ggml_tensor * get_embd() const = 0; + virtual ggml_tensor * get_embd_pooled() const = 0; + + virtual ggml_cgraph * get_gf() = 0; + virtual ggml_context * get_ctx() = 0; + + virtual void reset() = 0; virtual void set_inputs(const llama_ubatch * ubatch) = 0; + + virtual bool update(const llm_graph_params & params) = 0; }; using llm_graph_result_ptr = std::unique_ptr; +// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) +using llm_graph_cb = std::function; + +struct llm_graph_params { + llm_arch arch = LLM_ARCH_UNKNOWN; + + llama_hparams hparams; + llama_cparams cparams; + + llama_ubatch ubatch; // note: intentionally make a copy + + llm_graph_type gtype; + + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; + + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_context_i * mctx; + const llama_cross * cross; + + uint32_t n_outputs; + + llm_graph_cb cb; + + // TODO: temporary + llm_graph_result_i * res; + + bool is_same(const llm_graph_params & other) const { + return + hparams.is_same(other.hparams) && + cparams.is_same(other.cparams) && + ubatch .is_same(other.ubatch) && + arch == other.arch && + gtype == other.gtype && + cvec == other.cvec && + loras == other.loras && + cross == other.cross && + n_outputs == other.n_outputs; + } +}; class llm_graph_result : public llm_graph_result_i { public: + llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) { + reset(); + } + virtual ~llm_graph_result() = default; - ggml_tensor * get_tokens() override { return t_tokens; } - ggml_tensor * get_logits() override { return t_logits; } - ggml_tensor * get_embd() override { return t_embd; } - ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + ggml_tensor * get_tokens() const override { return t_tokens; } + ggml_tensor * get_logits() const override { return t_logits; } + ggml_tensor * get_embd() const override { return t_embd; } + ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; } + + ggml_cgraph * get_gf() override { return gf; } + ggml_context * get_ctx() override { return ctx_compute.get(); } + + void set_max_nodes(int64_t max_nodes) { + this->max_nodes = max_nodes; + } + + void reset() override { + t_tokens = nullptr; + t_logits = nullptr; + t_embd = nullptr; + t_embd_pooled = nullptr; + + inputs.clear(); + + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_compute.reset(ggml_init(params)); + + gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false); + } void set_inputs(const llama_ubatch * ubatch) override { for (auto & input : inputs) { @@ -403,6 +504,25 @@ class llm_graph_result : public llm_graph_result_i { } } + // try to update the existing graph result using the new graph parameters + // this can only be done if we determine that the resulting 
graph using the new graph parameters + // would be identical to the existing graph. in that case, we simply have to update the memory + // contexts of the input tensors of the graph and we can reuse it for another computation + // return true if the graph was updated and can be reused + bool update(const llm_graph_params & params) override { + if (!this->params.is_same(params)) { + return false; + } + + bool res = true; + + for (auto & input : inputs) { + res &= input->update(params); + } + + return res; + } + llm_graph_input_i * add_input(llm_graph_input_ptr input) { inputs.emplace_back(std::move(input)); return inputs.back().get(); @@ -415,37 +535,26 @@ class llm_graph_result : public llm_graph_result_i { ggml_tensor * t_embd_pooled = nullptr; std::vector inputs; -}; -// -// llm_graph_context -// + ggml_context_ptr ctx_compute; -// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) -using llm_graph_cb = std::function; + // memory buffers used to evaluate the model + std::vector buf_compute_meta; -struct llm_graph_params { - ggml_context * ctx; + ggml_cgraph * gf; - const llm_arch arch; + int64_t max_nodes; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - - ggml_backend_sched_t sched; - ggml_backend_t backend_cpu; - - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_context_i * mctx; - const llama_cross * cross; - - uint32_t n_outputs; - - const llm_graph_cb & cb; + // keep a copy of the previous graph parameters + // we will use this to determine whether the graph can be reused by comparing them with the new parameters + // note: these are updated after constructing the new graph + llm_graph_params params; }; +// +// llm_graph_context +// + // used in build_rs to properly order writes and avoid unnecessary copies using llm_graph_get_rows_fn = std::function; @@ -485,8 +594,6 @@ struct llm_graph_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context * ctx0 = nullptr; - ggml_backend_sched_t sched; ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? 
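
To summarize the reuse mechanism introduced above in one place: `llm_graph_result` now owns the graph metadata buffer, the `ggml_context` and the `ggml_cgraph`, remembers the `llm_graph_params` it was last built with, and `update()` only succeeds when the new parameters compare equal (`is_same()` over hparams, cparams, ubatch, etc.) and every registered input can be refreshed in place. The caller (`llama_context::process_ubatch`) then skips graph construction and scheduler allocation on a hit. Below is a condensed, self-contained C++ sketch of that flow; `graph_params`, `graph_input` and `graph_result` are simplified stand-ins for the real llama.cpp types, not the actual API.

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// simplified stand-ins for llm_graph_params / llm_graph_input_i / llm_graph_result
struct graph_params {
    int n_tokens  = 0;
    int n_outputs = 0;

    bool is_same(const graph_params & other) const {
        // the real llm_graph_params::is_same() also compares hparams, cparams,
        // the ubatch layout, the memory context pointer, arch and graph type
        return n_tokens == other.n_tokens && n_outputs == other.n_outputs;
    }
};

struct graph_input {
    virtual ~graph_input() = default;

    // true if the tensors created for the previous graph still have the right
    // shapes for the new parameters, i.e. they can simply be refilled with data
    virtual bool update(const graph_params & params) = 0;
    virtual void set_input(const graph_params & params) = 0;
};

struct graph_result {
    graph_params params; // parameters the previous graph was built with
    std::vector<std::unique_ptr<graph_input>> inputs;

    bool update(const graph_params & new_params) {
        if (!params.is_same(new_params)) {
            return false;
        }
        bool ok = true;
        for (auto & inp : inputs) {
            ok = ok && inp->update(new_params);
        }
        return ok;
    }

    void reset(const graph_params & new_params) {
        inputs.clear();
        params = new_params; // the PR records this at the end of build_graph()
        // ... reallocate the meta buffer, create a fresh ggml context and cgraph ...
    }
};

// caller side, condensed from llama_context::process_ubatch
void process(graph_result & res, const graph_params & gparams, bool graph_reuse) {
    const bool can_reuse = graph_reuse && res.update(gparams);

    if (!can_reuse) {
        res.reset(gparams); // rebuild the topology and re-run scheduler allocation
        // model.build_graph(gparams); ggml_backend_sched_alloc_graph(...);
    }

    for (auto & inp : res.inputs) {
        inp->set_input(gparams); // input data is always refreshed, reused or not
    }

    std::printf("graph %s\n", can_reuse ? "reused" : "rebuilt");
}
```

Note that `set_input()` still runs on every iteration; reuse only skips rebuilding the topology and re-running `ggml_backend_sched_alloc_graph`, while the new batch data is still written into the existing input tensors.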
@@ -498,7 +605,9 @@ struct llm_graph_context { const llm_graph_cb & cb_func; - std::unique_ptr res; + llm_graph_result * res; + + ggml_context * ctx0 = nullptr; llm_graph_context(const llm_graph_params & params); virtual ~llm_graph_context() = default; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 86c814d51b901..a59e96a92fe7e 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } + +bool llama_hparams::is_same(const llama_hparams & other) const { + return + n_ctx_train == other.n_ctx_train && + n_embd == other.n_embd && + n_layer == other.n_layer && + n_expert == other.n_expert && + n_expert_used == other.n_expert_used; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 476d0a5eade28..dfddb4f3d9534 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -202,6 +202,8 @@ struct llama_hparams { uint32_t n_pos_per_embd() const; bool is_swa(uint32_t il) const; + + bool is_same(const llama_hparams & other) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index d3129cc53281e..e91b92b6095e1 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( cells.resize(kv_size); + gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later + for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); @@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0; const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS"); - supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0; + supports_set_rows = LLAMA_SET_ROWS ? 
atoi(LLAMA_SET_ROWS) != 0 : 0; if (!supports_set_rows) { LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__); @@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_shift(res, lctx); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__); return updated; @@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d ggml_backend_sched_reset(sched); - auto * gf = lctx->graph_init(); + auto * res = gf_res.get(); - auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo); - if (!res) { - LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__); - return updated; - } + res->set_max_nodes(lctx->graph_max_nodes()); + res->reset(); + auto * gf = build_graph_defrag(res, lctx, dinfo); if (!ggml_backend_sched_alloc_graph(sched, gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); return updated; @@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const { return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))); } +bool llama_kv_cache_unified::get_supports_set_rows() const { + return supports_set_rows; +} + ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const { const int32_t ikv = map_layer_ids.at(il); @@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; @@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size()); ggml_set_input(inp->k_shift); + const auto & cparams = lctx->get_cparams(); + for (const auto & layer : layers) { const uint32_t il = layer.il; @@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( res->add_input(std::move(inp)); - return res; + return gf; } -llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, - const defrag_info & dinfo) const { - auto res = std::make_unique(); +ggml_cgraph * llama_kv_cache_unified::build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, + const defrag_info & dinfo) const { + auto * ctx = res->get_ctx(); + auto * gf = res->get_gf(); const auto & ids = dinfo.ids; + const auto & cparams = lctx->get_cparams(); + #if 0 // CPU defrag // @@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( //LLAMA_LOG_INFO("gf->n_nodes = 
%d\n", gf->n_nodes); #endif - return res; + return gf; } llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const { @@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const { return n_kv; } +bool llama_kv_cache_unified_context::get_supports_set_rows() const { + return kv->get_supports_set_rows(); +} + ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const { return kv->get_k(ctx, il, n_kv); } diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index b8b0356e830c8..3eff137fee953 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -121,6 +121,9 @@ class llama_kv_cache_unified : public llama_memory_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const; @@ -193,13 +196,15 @@ class llama_kv_cache_unified : public llama_memory_i { // env: LLAMA_SET_ROWS (temporary) // ref: https://github.com/ggml-org/llama.cpp/pull/14285 - int supports_set_rows = false; + bool supports_set_rows = false; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; + std::unique_ptr gf_res; + llama_kv_cells_unified cells; std::vector layers; @@ -226,15 +231,13 @@ class llama_kv_cache_unified : public llama_memory_i { float freq_base, float freq_scale) const; - llm_graph_result_ptr build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; + ggml_cgraph * build_graph_shift( + llm_graph_result * res, + llama_context * lctx) const; - llm_graph_result_ptr build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf, + ggml_cgraph * build_graph_defrag( + llm_graph_result * res, + llama_context * lctx, const defrag_info & dinfo) const; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; @@ -288,6 +291,9 @@ class llama_kv_cache_unified_context : public llama_memory_context_i { uint32_t get_n_kv() const; + // TODO: temporary + bool get_supports_set_rows() const; + // get views of the current state of the cache ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0573c5bcea0a4..caab6a0d270be 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, return res; } -llm_graph_result_ptr llama_model::build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const { +ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { + // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere + auto * gf = params.res->get_gf(); + std::unique_ptr llm; switch (arch) { @@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_T5: { - switch (type) { + switch (params.gtype) { case LLM_GRAPH_TYPE_ENCODER: llm = std::make_unique(*this, params, gf); break; @@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph( // add on pooling layer llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); - return 
std::move(llm->res); + // TODO: updating the graph parameters here is a little bit obscure - figure out something better + llm->res->params = params; + + return llm->res->get_gf(); } // diff --git a/src/llama-model.h b/src/llama-model.h index 979fff62045f9..9bf9cf327dab9 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -436,10 +436,7 @@ struct llama_model { llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface - llm_graph_result_ptr build_graph( - const llm_graph_params & params, - ggml_cgraph * gf, - llm_graph_type type) const; + ggml_cgraph * build_graph(const llm_graph_params & params) const; private: struct impl; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b80e984d0245b..ad74dbaf27a11 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -261,6 +261,7 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; std::vector no_op_offload; + std::vector graph_reuse; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, + /* graph_reuse */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ot --override-tensors =;...\n"); printf(" (default: disabled)\n"); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" -gr, --graph-reuse <0|1> (default: 0)\n"); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "-gr" || arg == "--graph-reuse") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; @@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.no_op_offload.empty()) { params.no_op_offload = cmd_params_defaults.no_op_offload; } + if (params.graph_reuse.empty()) { + params.graph_reuse = cmd_params_defaults.graph_reuse; + } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -926,6 +939,7 @@ struct cmd_params_instance { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -998,6 +1012,7 @@ struct cmd_params_instance { cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; + cparams.graph_reuse = graph_reuse; return cparams; } @@ -1018,6 +1033,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) + for (const auto & gr : params.graph_reuse) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -1059,6 +1075,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, 
/* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1092,6 +1109,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1125,6 +1143,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .use_mmap = */ mmp, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, + /* .graph_reuse = */ gr, }; instances.push_back(instance); } @@ -1162,6 +1181,7 @@ struct test { bool use_mmap; bool embeddings; bool no_op_offload; + bool graph_reuse; int n_prompt; int n_gen; int n_depth; @@ -1197,6 +1217,7 @@ struct test { use_mmap = inst.use_mmap; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; + graph_reuse = inst.graph_reuse; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1243,8 +1264,8 @@ struct test { "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "defrag_thold", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1259,7 +1280,7 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "embeddings") { + field == "use_mmap" || field == "embeddings" || field == "graph_reuse") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { @@ -1333,6 +1354,7 @@ struct test { std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), + std::to_string(graph_reuse), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1518,6 +1540,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return 4; } + if (field == "graph_reuse") { + return 4; + } int width = std::max((int) field.length(), 10); @@ -1552,6 +1577,9 @@ struct markdown_printer : public printer { if (field == "no_op_offload") { return "nopo"; } + if (field == "graph_reuse") { + return "gr"; + } if (field == "tensor_split") { return "ts"; } @@ -1626,6 +1654,9 @@ struct markdown_printer : public printer { if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { fields.emplace_back("no_op_offload"); } + if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) { + fields.emplace_back("graph_reuse"); + } fields.emplace_back("test"); fields.emplace_back("t/s");
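
For completeness, the Metal-backend side of the change (marked in the diff itself as a hacky check) answers the same question at a lower level: it keeps a private copy of the previous graph's node metadata and memcmp-s the incoming nodes against it, falling back to the regular encode path on any mismatch. Below is a minimal, self-contained C++ sketch of that fingerprinting idea; `node_meta` is an illustrative placeholder for `struct ggml_tensor`, and the real code compares the `gf->nodes[i]` entries one by one against the preallocated `cg_nodes` array.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// illustrative placeholder for the per-node record being fingerprinted; the
// Metal backend compares whole `struct ggml_tensor` entries (op, shape, srcs, data)
struct node_meta {
    int     op;
    int64_t ne[4];
    void *  src[2];
    void *  data;
};

// byte-for-byte snapshot of the last submitted graph, playing the role of the
// `cg_nodes` / `n_nodes_prev` fields added to the Metal context in this PR
struct graph_fingerprint {
    std::vector<unsigned char> snapshot;

    // returns true if the new node array is identical to the previous one;
    // on a mismatch, refreshes the snapshot so the next submission can match
    bool same_as_previous(const node_meta * nodes, std::size_t n_nodes) {
        const std::size_t n_bytes = n_nodes * sizeof(node_meta);

        const bool same = snapshot.size() == n_bytes &&
                          std::memcmp(snapshot.data(), nodes, n_bytes) == 0;

        if (!same) {
            snapshot.resize(n_bytes);
            std::memcpy(snapshot.data(), nodes, n_bytes);
        }

        return same;
    }
};
```

On a match, the backend commits the command buffer it pre-encoded during the previous call (alternating between the two `cmd_bufs_next` slots via `i_next`) and immediately starts encoding the next one on the CPU while the GPU executes; on a mismatch it goes through the usual `n_cb`-way asynchronous encode path.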