Skip to content

Commit ab82dc2

Browse files
committed
kv-cache : avoid throwing exceptions across the C boundary
ggml-ci
1 parent a86b929 commit ab82dc2

File tree

2 files changed

+44 -22 lines changed

src/llama-kv-cache-unified.cpp

Lines changed: 40 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -214,8 +214,10 @@ void llama_kv_cache_unified::clear(bool data) {
214214
}
215215

216216
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
217-
auto & cells = v_cells[seq_to_stream.at(seq_id)];
218-
auto & head = v_heads[seq_to_stream.at(seq_id)];
217+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
218+
219+
auto & cells = v_cells[seq_to_stream[seq_id]];
220+
auto & head = v_heads[seq_to_stream[seq_id]];
219221

220222
uint32_t new_head = cells.size();
221223

@@ -263,8 +265,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
263265
}
264266

265267
void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
266-
const auto s0 = seq_to_stream.at(seq_id_src);
267-
const auto s1 = seq_to_stream.at(seq_id_dst);
268+
GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
269+
GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
270+
271+
const auto s0 = seq_to_stream[seq_id_src];
272+
const auto s1 = seq_to_stream[seq_id_dst];
268273

269274
if (s0 == s1) {
270275
// since both sequences are in the same stream, no data copy is necessary
@@ -343,8 +348,10 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
343348
}
344349

345350
void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
346-
auto & cells = v_cells[seq_to_stream.at(seq_id)];
347-
auto & head = v_heads[seq_to_stream.at(seq_id)];
351+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
352+
353+
auto & cells = v_cells[seq_to_stream[seq_id]];
354+
auto & head = v_heads[seq_to_stream[seq_id]];
348355

349356
uint32_t new_head = cells.size();
350357

@@ -363,8 +370,10 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
363370
}
364371

365372
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
366-
auto & cells = v_cells[seq_to_stream.at(seq_id)];
367-
auto & head = v_heads[seq_to_stream.at(seq_id)];
373+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
374+
375+
auto & cells = v_cells[seq_to_stream[seq_id]];
376+
auto & head = v_heads[seq_to_stream[seq_id]];
368377

369378
if (shift == 0) {
370379
return;
@@ -405,7 +414,9 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
405414
}
406415

407416
void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
408-
auto & cells = v_cells[seq_to_stream.at(seq_id)];
417+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
418+
419+
auto & cells = v_cells[seq_to_stream[seq_id]];
409420

410421
if (d == 1) {
411422
return;
@@ -436,13 +447,17 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
436447
}
437448

438449
llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
439-
const auto & cells = v_cells[seq_to_stream.at(seq_id)];
450+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
451+
452+
const auto & cells = v_cells[seq_to_stream[seq_id]];
440453

441454
return cells.seq_pos_min(seq_id);
442455
}
443456

444457
llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
445-
const auto & cells = v_cells[seq_to_stream.at(seq_id)];
458+
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
459+
460+
const auto & cells = v_cells[seq_to_stream[seq_id]];
446461

447462
return cells.seq_pos_max(seq_id);
448463
}
@@ -606,8 +621,11 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
606621
const size_t n_copy = sc_info.ssrc.size();
607622

608623
for (size_t i = 0; i < n_copy; ++i) {
609-
const auto ssrc = sc_info.ssrc.at(i);
610-
const auto sdst = sc_info.sdst.at(i);
624+
const auto ssrc = sc_info.ssrc[i];
625+
const auto sdst = sc_info.sdst[i];
626+
627+
assert(ssrc < n_stream);
628+
assert(sdst < n_stream);
611629

612630
LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst);
613631

@@ -616,8 +634,8 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
616634
for (uint32_t il = 0; il < layers.size(); ++il) {
617635
const auto & layer = layers[il];
618636

619-
ggml_backend_tensor_copy(layer.k_stream.at(ssrc), layer.k_stream.at(sdst));
620-
ggml_backend_tensor_copy(layer.v_stream.at(ssrc), layer.v_stream.at(sdst));
637+
ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
638+
ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
621639
}
622640
}
623641
}
@@ -927,7 +945,7 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u
927945

928946
auto & cells = v_cells[sinfo.strm[s]];
929947

930-
const auto idx = sinfo.idxs.at(s).at(ii);
948+
const auto idx = sinfo.idxs[s][ii];
931949

932950
if (!cells.is_empty(idx)) {
933951
assert(cells.seq_count(idx) == 1);
@@ -1189,7 +1207,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba
11891207
const int64_t offs = sinfo.strm[s]*get_size();
11901208

11911209
for (uint32_t i = 0; i < sinfo.size(); ++i) {
1192-
data[s*sinfo.size() + i] = offs + sinfo.idxs.at(s).at(i);
1210+
data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
11931211
}
11941212
}
11951213
}
@@ -1210,7 +1228,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
12101228
const int64_t offs = sinfo.strm[s]*get_size();
12111229

12121230
for (uint32_t i = 0; i < sinfo.size(); ++i) {
1213-
data[s*sinfo.size() + i] = offs + sinfo.idxs.at(s).at(i);
1231+
data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
12141232
}
12151233
}
12161234
} else {
@@ -1224,7 +1242,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
12241242

12251243
for (uint32_t i = 0; i < sinfo.size(); ++i) {
12261244
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1227-
data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs.at(s).at(i);
1245+
data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs[s][i];
12281246
}
12291247
}
12301248
}
@@ -1847,6 +1865,8 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
18471865
}
18481866

18491867
void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1868+
GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
1869+
18501870
uint32_t n_stream_cur;
18511871
io.read_to(&n_stream_cur, sizeof(n_stream_cur));
18521872
if (n_stream_cur != n_stream) {
@@ -1861,7 +1881,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
18611881
continue;
18621882
}
18631883

1864-
const uint32_t strm = seq_id == -1 ? s : seq_to_stream.at(seq_id);
1884+
const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
18651885

18661886
bool res = true;
18671887
res = res && state_read_meta(io, strm, cell_count, seq_id);

src/llama-kv-cache-unified.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,9 @@ class llama_kv_cache_unified : public llama_memory_i {
6060

6161
uint32_t head() const {
6262
GGML_ASSERT(idxs.size() == 1);
63+
GGML_ASSERT(!idxs[0].empty());
6364

64-
return idxs.at(0).at(0);
65+
return idxs[0][0];
6566
}
6667

6768
void resize(size_t n) {
@@ -71,8 +72,9 @@ class llama_kv_cache_unified : public llama_memory_i {
7172

7273
size_t size() const {
7374
GGML_ASSERT(idxs.size() == strm.size());
75+
GGML_ASSERT(!idxs.empty());
7476

75-
return idxs.at(0).size();
77+
return idxs[0].size();
7678
}
7779

7880
size_t n_stream() const {

0 commit comments

Comments
 (0)