@@ -214,8 +214,10 @@ void llama_kv_cache_unified::clear(bool data) {
214
214
}
215
215
216
216
bool llama_kv_cache_unified::seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
217
- auto & cells = v_cells[seq_to_stream.at (seq_id)];
218
- auto & head = v_heads[seq_to_stream.at (seq_id)];
217
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
218
+
219
+ auto & cells = v_cells[seq_to_stream[seq_id]];
220
+ auto & head = v_heads[seq_to_stream[seq_id]];
219
221
220
222
uint32_t new_head = cells.size ();
221
223
@@ -263,8 +265,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
263
265
}
264
266
265
267
void llama_kv_cache_unified::seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
266
- const auto s0 = seq_to_stream.at (seq_id_src);
267
- const auto s1 = seq_to_stream.at (seq_id_dst);
268
+ GGML_ASSERT (seq_id_src >= 0 && (size_t ) seq_id_src < seq_to_stream.size ());
269
+ GGML_ASSERT (seq_id_dst >= 0 && (size_t ) seq_id_dst < seq_to_stream.size ());
270
+
271
+ const auto s0 = seq_to_stream[seq_id_src];
272
+ const auto s1 = seq_to_stream[seq_id_dst];
268
273
269
274
if (s0 == s1) {
270
275
// since both sequences are in the same stream, no data copy is necessary
@@ -343,8 +348,10 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
343
348
}
344
349
345
350
void llama_kv_cache_unified::seq_keep (llama_seq_id seq_id) {
346
- auto & cells = v_cells[seq_to_stream.at (seq_id)];
347
- auto & head = v_heads[seq_to_stream.at (seq_id)];
351
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
352
+
353
+ auto & cells = v_cells[seq_to_stream[seq_id]];
354
+ auto & head = v_heads[seq_to_stream[seq_id]];
348
355
349
356
uint32_t new_head = cells.size ();
350
357
@@ -363,8 +370,10 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
363
370
}
364
371
365
372
void llama_kv_cache_unified::seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
366
- auto & cells = v_cells[seq_to_stream.at (seq_id)];
367
- auto & head = v_heads[seq_to_stream.at (seq_id)];
373
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
374
+
375
+ auto & cells = v_cells[seq_to_stream[seq_id]];
376
+ auto & head = v_heads[seq_to_stream[seq_id]];
368
377
369
378
if (shift == 0 ) {
370
379
return ;
@@ -405,7 +414,9 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
405
414
}
406
415
407
416
void llama_kv_cache_unified::seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
408
- auto & cells = v_cells[seq_to_stream.at (seq_id)];
417
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
418
+
419
+ auto & cells = v_cells[seq_to_stream[seq_id]];
409
420
410
421
if (d == 1 ) {
411
422
return ;
@@ -436,13 +447,17 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
436
447
}
437
448
438
449
llama_pos llama_kv_cache_unified::seq_pos_min (llama_seq_id seq_id) const {
439
- const auto & cells = v_cells[seq_to_stream.at (seq_id)];
450
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
451
+
452
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
440
453
441
454
return cells.seq_pos_min (seq_id);
442
455
}
443
456
444
457
llama_pos llama_kv_cache_unified::seq_pos_max (llama_seq_id seq_id) const {
445
- const auto & cells = v_cells[seq_to_stream.at (seq_id)];
458
+ GGML_ASSERT (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ());
459
+
460
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
446
461
447
462
return cells.seq_pos_max (seq_id);
448
463
}
@@ -606,8 +621,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
606
621
const size_t n_copy = sc_info.ssrc .size ();
607
622
608
623
for (size_t i = 0 ; i < n_copy; ++i) {
609
- const auto ssrc = sc_info.ssrc .at (i);
610
- const auto sdst = sc_info.sdst .at (i);
624
+ const auto ssrc = sc_info.ssrc [i];
625
+ const auto sdst = sc_info.sdst [i];
626
+
627
+ assert (ssrc < n_stream);
628
+ assert (sdst < n_stream);
611
629
612
630
LLAMA_LOG_DEBUG (" %s: copying KV buffer: stream %d to stream %d\n " , __func__, ssrc, sdst);
613
631
@@ -616,8 +634,8 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
616
634
for (uint32_t il = 0 ; il < layers.size (); ++il) {
617
635
const auto & layer = layers[il];
618
636
619
- ggml_backend_tensor_copy (layer.k_stream . at ( ssrc) , layer.k_stream . at ( sdst) );
620
- ggml_backend_tensor_copy (layer.v_stream . at ( ssrc) , layer.v_stream . at ( sdst) );
637
+ ggml_backend_tensor_copy (layer.k_stream [ ssrc] , layer.k_stream [ sdst] );
638
+ ggml_backend_tensor_copy (layer.v_stream [ ssrc] , layer.v_stream [ sdst] );
621
639
}
622
640
}
623
641
}
@@ -927,7 +945,7 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u
927
945
928
946
auto & cells = v_cells[sinfo.strm [s]];
929
947
930
- const auto idx = sinfo.idxs . at (s). at (ii) ;
948
+ const auto idx = sinfo.idxs [s][ii] ;
931
949
932
950
if (!cells.is_empty (idx)) {
933
951
assert (cells.seq_count (idx) == 1 );
@@ -1189,7 +1207,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba
1189
1207
const int64_t offs = sinfo.strm [s]*get_size ();
1190
1208
1191
1209
for (uint32_t i = 0 ; i < sinfo.size (); ++i) {
1192
- data[s*sinfo.size () + i] = offs + sinfo.idxs . at (s). at (i) ;
1210
+ data[s*sinfo.size () + i] = offs + sinfo.idxs [s][i] ;
1193
1211
}
1194
1212
}
1195
1213
}
@@ -1210,7 +1228,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
1210
1228
const int64_t offs = sinfo.strm [s]*get_size ();
1211
1229
1212
1230
for (uint32_t i = 0 ; i < sinfo.size (); ++i) {
1213
- data[s*sinfo.size () + i] = offs + sinfo.idxs . at (s). at (i) ;
1231
+ data[s*sinfo.size () + i] = offs + sinfo.idxs [s][i] ;
1214
1232
}
1215
1233
}
1216
1234
} else {
@@ -1224,7 +1242,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
1224
1242
1225
1243
for (uint32_t i = 0 ; i < sinfo.size (); ++i) {
1226
1244
for (uint32_t j = 0 ; j < n_embd_v_gqa; ++j) {
1227
- data[s*sinfo.size ()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs . at (s). at (i) ;
1245
+ data[s*sinfo.size ()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs [s][i] ;
1228
1246
}
1229
1247
}
1230
1248
}
@@ -1847,6 +1865,8 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
1847
1865
}
1848
1866
1849
1867
void llama_kv_cache_unified::state_read (llama_io_read_i & io, llama_seq_id seq_id) {
1868
+ GGML_ASSERT (seq_id == -1 || (seq_id >= 0 && (size_t ) seq_id < seq_to_stream.size ()));
1869
+
1850
1870
uint32_t n_stream_cur;
1851
1871
io.read_to (&n_stream_cur, sizeof (n_stream_cur));
1852
1872
if (n_stream_cur != n_stream) {
@@ -1861,7 +1881,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
1861
1881
continue ;
1862
1882
}
1863
1883
1864
- const uint32_t strm = seq_id == -1 ? s : seq_to_stream. at ( seq_id) ;
1884
+ const uint32_t strm = seq_id == -1 ? s : seq_to_stream[ seq_id] ;
1865
1885
1866
1886
bool res = true ;
1867
1887
res = res && state_read_meta (io, strm, cell_count, seq_id);
0 commit comments