@@ -311,14 +311,9 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
     GGML_ASSERT(is_full && "seq_cp() is only supported for full KV buffers");
 
-    // LLAMA_LOG_WARN("%s: copying KV buffer from %d (stream = %d) to %d (stream = %d)\n", __func__, seq_id_src, s0, seq_id_dst, s1);
-
-    for (uint32_t il = 0; il < layers.size(); ++il) {
-        const auto & layer = layers[il];
-
-        ggml_backend_tensor_copy(layer.k_stream[s0], layer.k_stream[s1]);
-        ggml_backend_tensor_copy(layer.v_stream[s0], layer.v_stream[s1]);
-    }
+    // enqueue the copy operation - the buffer copy will be performed during the next update
+    sc_info.ssrc.push_back(s0);
+    sc_info.sdst.push_back(s1);
 
     v_cells[s1].reset();
 
     for (uint32_t i = 0; i < v_cells[s0].size(); ++i) {
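For context: with this change, seq_cp() no longer copies the per-stream K/V tensors directly; it only records the (source stream, destination stream) pair in sc_info. The stream_copy_info type itself lives in the header and is not part of this diff; the following is only a rough sketch inferred from the ssrc/sdst/empty() usage in these hunks (the member types and the empty() body are assumptions):

#include <cassert>
#include <vector>

// sketch only - inferred from usage in this diff, not the actual header
struct stream_copy_info {
    bool empty() const {
        assert(ssrc.size() == sdst.size());
        return ssrc.empty();
    }

    // parallel arrays of queued (source, destination) stream indices
    std::vector<uint32_t> ssrc;
    std::vector<uint32_t> sdst;
};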
@@ -526,7 +521,7 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct
         }
     }
 
-    return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo));
+    return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
 }
 
 llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
@@ -598,11 +593,35 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st
     return res;
 }
 
-bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) {
+bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
     bool updated = false;
 
     auto * sched = lctx->get_sched();
 
+    if (!sc_info.empty()) {
+        assert(n_stream > 1 && "stream copy should never happen with a single stream");
+
+        llama_synchronize(lctx);
+
+        const size_t n_copy = sc_info.ssrc.size();
+
+        for (size_t i = 0; i < n_copy; ++i) {
+            const auto ssrc = sc_info.ssrc.at(i);
+            const auto sdst = sc_info.sdst.at(i);
+
+            LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst);
+
+            assert(ssrc != sdst);
+
+            for (uint32_t il = 0; il < layers.size(); ++il) {
+                const auto & layer = layers[il];
+
+                ggml_backend_tensor_copy(layer.k_stream.at(ssrc), layer.k_stream.at(sdst));
+                ggml_backend_tensor_copy(layer.v_stream.at(ssrc), layer.v_stream.at(sdst));
+            }
+        }
+    }
+
     if (do_shift) {
         if (!get_can_shift()) {
             GGML_ABORT("The current KV cache / model configuration does not support K-shift");
@@ -2242,8 +2261,9 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
         llama_context * lctx,
         bool do_shift,
-        defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
-    if (!do_shift && this->dinfo.empty()) {
+        defrag_info dinfo,
+        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) {
+    if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
 }
@@ -2271,7 +2291,7 @@ bool llama_kv_cache_unified_context::apply() {
 
     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {
-        kv->update(lctx, do_shift, dinfo);
+        kv->update(lctx, do_shift, dinfo, sc_info);
 
         return true;
     }
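Taken together, a sequence copy is now deferred: seq_cp() only queues the stream pair, and the actual ggml_backend_tensor_copy() calls run inside update() when the queued memory update is applied. A hedged caller-side sketch of that flow, using the public memory API (llama_get_memory() and llama_memory_seq_cp() are assumptions about the surrounding llama.cpp version and are not part of this diff; the helper name is illustrative):

#include "llama.h"

// sketch: fork sequence `src` into `dst` using the deferred stream copy
static void fork_sequence(llama_context * ctx, llama_seq_id src, llama_seq_id dst) {
    llama_memory_t mem = llama_get_memory(ctx);

    // records the (src stream, dst stream) pair via seq_cp(); no buffer copy happens here
    llama_memory_seq_cp(mem, src, dst, -1, -1);

    // the queued ggml_backend_tensor_copy() calls run when the memory update is
    // applied, e.g. at the start of the next llama_decode() on this context
}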