Skip to content

Commit d99fa64

Browse files
yq33victorliutongxuan
authored and committed
bugfix: fix prefix cache evict bug.
Signed-off-by: fangyuanhong1 <[email protected]>
1 parent a32aa5e commit d99fa64

8 files changed

+77
-69
lines changed

xllm/core/framework/prefix_cache/prefix_cache_hash_murmur3.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ size_t PrefixCacheHashMurmur3::evict(size_t n_blocks) {
207207
Node* iter_node = lru_lst_.get_first();
208208
std::vector<Murmur3Key> del_list;
209209
del_list.reserve(n_blocks);
210-
for (size_t i = 0; i < n_blocks; ++i) {
210+
for (size_t i = 0; i < n_blocks;) {
211211
if (lru_lst_.is_last(iter_node)) {
212212
break;
213213
}
@@ -233,6 +233,7 @@ size_t PrefixCacheHashMurmur3::evict(size_t n_blocks) {
233233
delete del_node;
234234
++evict_count;
235235
--num_blocks_;
236+
++i;
236237
}
237238
if (enable_service_routing_) {
238239
threadpool_.schedule([del_list = std::move(del_list), this]() {

xllm/core/framework/prefix_cache/prefix_cache_hash_sha256.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ size_t PrefixCacheHashSha256::evict(size_t n_blocks) {
163163

164164
size_t evict_count = 0;
165165
Node* iter_node = lru_lst_.get_first();
166-
for (; evict_count < n_blocks; ++evict_count) {
166+
for (; evict_count < n_blocks;) {
167167
if (lru_lst_.is_last(iter_node)) {
168168
break;
169169
}
@@ -185,6 +185,7 @@ size_t PrefixCacheHashSha256::evict(size_t n_blocks) {
185185
delete del_node;
186186

187187
--num_blocks_;
188+
++evict_count;
188189
}
189190

190191
return evict_count;

xllm/core/scheduler/chunked_prefill_scheduler.cpp

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ void ChunkedPrefillScheduler::handle_abnormal_request(
7777
<< "Running queue size is not 1, there maybe a bug of request "
7878
"preemption logic. running_queue_.size ="
7979
<< running_queue_.size();
80-
if (util::sum(block_manager_->num_used_blocks()) !=
80+
if (util::sum(block_manager_pool_->num_used_blocks()) !=
8181
request->total_num_blocks()) {
8282
// blocks_exhausted is true.
8383
// NOTE: consider dp > 1, here we need get all num blocks in use.
@@ -91,7 +91,7 @@ void ChunkedPrefillScheduler::handle_abnormal_request(
9191

9292
// request is too long, budget or memory no enough.
9393
running_queue_.pop_front();
94-
block_manager_->deallocate(request.get());
94+
block_manager_pool_->deallocate(request.get());
9595
response_processor_->process_failed_request(
9696
request,
9797
{StatusCode::RESOURCE_EXHAUSTED,
@@ -166,7 +166,7 @@ void ChunkedPrefillScheduler::handle_running_queue_requests(
166166
}
167167

168168
// if (sequence->if_cache_block_for_prefill()) {
169-
// block_manager_->cache(sequence.get());
169+
// block_manager_pool_->cache(sequence.get());
170170
// }
171171

172172
// the new request do chunked prefill
@@ -221,7 +221,7 @@ void ChunkedPrefillScheduler::handle_running_queue_requests(
221221

222222
if (request_to_preempt.get() != request.get()) {
223223
++num_preempted_requests;
224-
block_manager_->deallocate(request_to_preempt.get());
224+
block_manager_pool_->deallocate(request_to_preempt.get());
225225
running_queue_.pop_back();
226226
// add preemptable request to waiting priority queue
227227
request_to_preempt->set_preempted();
@@ -260,7 +260,7 @@ void ChunkedPrefillScheduler::handle_prefill_requests(
260260
remaining_seq_budget > 0) {
261261
std::shared_ptr<Request> request(waiting_priority_queue_.top());
262262
if (request->finished() || request->cancelled()) {
263-
block_manager_->deallocate(request.get());
263+
block_manager_pool_->deallocate(request.get());
264264
// release the ownership of the request
265265
finished_requests.emplace_back(request);
266266
// remove the request from the priority queue
@@ -304,7 +304,7 @@ void ChunkedPrefillScheduler::handle_prefill_requests(
304304
num_tokens,
305305
&current_step_handle_tokens)) {
306306
// release shared blocks
307-
block_manager_->deallocate(prefill_sequence.get());
307+
block_manager_pool_->deallocate(prefill_sequence.get());
308308
can_schedule = false;
309309
blocks_exhausted = true;
310310
break;
@@ -318,7 +318,7 @@ void ChunkedPrefillScheduler::handle_prefill_requests(
318318
if (!can_schedule) {
319319
for (auto& seq : prefill_sequences) {
320320
// release shared blocks
321-
block_manager_->deallocate(seq);
321+
block_manager_pool_->deallocate(seq);
322322
}
323323
break;
324324
}
@@ -343,13 +343,14 @@ void ChunkedPrefillScheduler::handle_prefill_requests(
343343
}
344344

345345
if (running_sequences_.empty() && !waiting_priority_queue_.empty() &&
346-
running_queue_.empty() && block_manager_->kv_cache_utilization() == 0) {
346+
running_queue_.empty() &&
347+
block_manager_pool_->kv_cache_utilization() == 0) {
347348
LOG(ERROR) << "Request prompt is too long, no enough memory to schedule "
348349
"a single sequence";
349350
// no enough memory to schedule single sequence, just finish the request
350351
std::shared_ptr<Request> request(waiting_priority_queue_.top());
351352
waiting_priority_queue_.pop();
352-
block_manager_->deallocate(request.get());
353+
block_manager_pool_->deallocate(request.get());
353354
response_processor_->process_failed_request(
354355
request,
355356
{StatusCode::RESOURCE_EXHAUSTED,
@@ -425,7 +426,7 @@ std::vector<Batch> ChunkedPrefillScheduler::prepare_batch() {
425426
std::shared_ptr<Request> request = *it;
426427
request->update_connection_status();
427428
if (request->finished() || request->cancelled()) {
428-
block_manager_->deallocate(request.get());
429+
block_manager_pool_->deallocate(request.get());
429430
// release the ownership of the request
430431
finished_requests.emplace_back(request);
431432
// finished request is set to nullptr
@@ -451,13 +452,13 @@ std::vector<Batch> ChunkedPrefillScheduler::prepare_batch() {
451452
// check if the request can be expanded
452453
if (request->expand_sequences()) {
453454
// cache the blocks to share among the sequences
454-
block_manager_->cache(request->sequences()[0].get());
455+
block_manager_pool_->cache(request->sequences()[0].get());
455456
}
456457

457458
// release blocks for finished sequences here
458459
for (auto& sequence : request->sequences()) {
459460
if (sequence->finished()) {
460-
block_manager_->deallocate(sequence.get());
461+
block_manager_pool_->deallocate(sequence.get());
461462
}
462463
}
463464

@@ -555,11 +556,12 @@ std::vector<Batch> ChunkedPrefillScheduler::prepare_batch() {
555556

556557
GAUGE_SET(num_running_sequences, running_sequences_.size());
557558

558-
GAUGE_SET(kv_cache_utilization_perc, block_manager_->kv_cache_utilization());
559+
GAUGE_SET(kv_cache_utilization_perc,
560+
block_manager_pool_->kv_cache_utilization());
559561
GAUGE_SET(num_blocks_in_prefix_cache,
560-
util::min(block_manager_->num_blocks_in_prefix_cache()));
561-
GAUGE_SET(num_free_blocks, util::max(block_manager_->num_free_blocks()));
562-
GAUGE_SET(num_used_blocks, util::min(block_manager_->num_used_blocks()));
562+
util::min(block_manager_pool_->num_blocks_in_prefix_cache()));
563+
GAUGE_SET(num_free_blocks, util::max(block_manager_pool_->num_free_blocks()));
564+
GAUGE_SET(num_used_blocks, util::min(block_manager_pool_->num_used_blocks()));
563565

564566
return batches;
565567
}
@@ -573,7 +575,7 @@ bool ChunkedPrefillScheduler::allocate_blocks_for(
573575

574576
if (sequence->kv_state().num_kv_blocks() == 0) {
575577
// allocate shared blocks
576-
block_manager_->allocate_shared(sequence);
578+
block_manager_pool_->allocate_shared(sequence);
577579
}
578580
allocate_shared_blocks_for(sequence);
579581

@@ -600,13 +602,13 @@ bool ChunkedPrefillScheduler::allocate_blocks_for(
600602
// number of tokens and the number of tokens already processed
601603
*current_step_handle_tokens = max_handle_num_tokens - kv_cache_tokens_num;
602604
// allocate blocks for the sequence
603-
return block_manager_->allocate(sequence, max_handle_num_tokens);
605+
return block_manager_pool_->allocate(sequence, max_handle_num_tokens);
604606
}
605607

606608
void ChunkedPrefillScheduler::allocate_shared_blocks_for(Sequence* sequence) {
607609
if (sequence->kv_state().num_kv_blocks() == 0) {
608610
// allocate shared blocks
609-
block_manager_->allocate_shared(sequence);
611+
block_manager_pool_->allocate_shared(sequence);
610612
return;
611613
}
612614
if (sequence->is_prefill_stage()) {
@@ -616,7 +618,7 @@ void ChunkedPrefillScheduler::allocate_shared_blocks_for(Sequence* sequence) {
616618
(sequence->num_tokens() + max_tokens_per_chunk_for_prefill - 1) /
617619
max_tokens_per_chunk_for_prefill;
618620
if (total_chunked_size < FLAGS_chunked_match_frequency) {
619-
block_manager_->allocate_shared(sequence);
621+
block_manager_pool_->allocate_shared(sequence);
620622
return;
621623
}
622624
size_t prefix_cache_interval =
@@ -626,7 +628,7 @@ void ChunkedPrefillScheduler::allocate_shared_blocks_for(Sequence* sequence) {
626628
max_tokens_per_chunk_for_prefill;
627629
if (cur_chunked_index % prefix_cache_interval == 0) {
628630
// allocate shared blocks
629-
block_manager_->allocate_shared(sequence);
631+
block_manager_pool_->allocate_shared(sequence);
630632
}
631633
}
632634
}

xllm/core/scheduler/continuous_scheduler.cpp

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ constexpr size_t kRequestQueueSize = 100000;
3939
ContinuousScheduler::ContinuousScheduler(Engine* engine, const Options& options)
4040
: options_(options), engine_(engine), request_queue_(kRequestQueueSize) {
4141
CHECK(engine_ != nullptr);
42-
block_manager_ = engine_->block_manager_pool();
43-
CHECK(block_manager_ != nullptr);
44-
enable_prefix_cache_ = block_manager_->options().enable_prefix_cache();
42+
block_manager_pool_ = engine_->block_manager_pool();
43+
CHECK(block_manager_pool_ != nullptr);
44+
enable_prefix_cache_ = block_manager_pool_->options().enable_prefix_cache();
4545

4646
last_batch_.resize(options_.dp_size());
4747

@@ -85,11 +85,11 @@ void ContinuousScheduler::handle_prefill_requests(
8585
// they may contian many sequences, so we should check here.
8686
while (!waiting_priority_queue_.empty() && remaining_seq_budget > 0 &&
8787
remaining_token_budget > 0 &&
88-
block_manager_->kv_cache_utilization() <
88+
block_manager_pool_->kv_cache_utilization() <
8989
FLAGS_prefill_scheduling_memory_usage_threshold) {
9090
std::shared_ptr<Request> request(waiting_priority_queue_.top());
9191
if (request->finished() || request->cancelled()) {
92-
block_manager_->deallocate(request.get());
92+
block_manager_pool_->deallocate(request.get());
9393
// release the ownership of the request
9494
finished_requests.emplace_back(request);
9595
// remove the request from the priority queue
@@ -124,8 +124,8 @@ void ContinuousScheduler::handle_prefill_requests(
124124
break;
125125
}
126126

127-
if (!block_manager_->allocate(prefill_sequence.get())) {
128-
block_manager_->deallocate(prefill_sequence.get());
127+
if (!block_manager_pool_->allocate(prefill_sequence.get())) {
128+
block_manager_pool_->deallocate(prefill_sequence.get());
129129
can_schedule = false;
130130
break;
131131
}
@@ -139,7 +139,7 @@ void ContinuousScheduler::handle_prefill_requests(
139139
if (!can_schedule) {
140140
for (auto& seq : prefill_sequences) {
141141
// release shared blocks
142-
block_manager_->deallocate(seq);
142+
block_manager_pool_->deallocate(seq);
143143
}
144144
break;
145145
}
@@ -161,13 +161,14 @@ void ContinuousScheduler::handle_prefill_requests(
161161
}
162162

163163
if (running_sequences_.empty() && !waiting_priority_queue_.empty() &&
164-
running_queue_.empty() && block_manager_->kv_cache_utilization() == 0) {
164+
running_queue_.empty() &&
165+
block_manager_pool_->kv_cache_utilization() == 0) {
165166
LOG(ERROR) << "Request prompt is too long, no enough memory to schedule "
166167
"a single sequence.";
167168
// no enough memory to schedule single sequence, just finish the request
168169
std::shared_ptr<Request> request(waiting_priority_queue_.top());
169170
waiting_priority_queue_.pop();
170-
block_manager_->deallocate(request.get());
171+
block_manager_pool_->deallocate(request.get());
171172
response_processor_->process_failed_request(
172173
request,
173174
{StatusCode::RESOURCE_EXHAUSTED,
@@ -224,13 +225,13 @@ void ContinuousScheduler::handle_decode_requests(
224225
size_t updated_num_tokens =
225226
sequence->num_tokens() + options_.num_speculative_tokens() + 1;
226227
// no blocks left
227-
if (!block_manager_->allocate(sequence.get(), updated_num_tokens)) {
228+
if (!block_manager_pool_->allocate(sequence.get(), updated_num_tokens)) {
228229
has_enough_blocks = false;
229230
break;
230231
}
231232

232233
if (sequence->if_cache_block_for_prefill()) {
233-
block_manager_->cache(sequence.get());
234+
block_manager_pool_->cache(sequence.get());
234235
}
235236

236237
// update the allocated tokens for the sequence
@@ -279,7 +280,7 @@ void ContinuousScheduler::handle_decode_requests(
279280

280281
if (request_to_preempt.get() != request.get()) {
281282
++num_preempted_requests;
282-
block_manager_->deallocate(request_to_preempt.get());
283+
block_manager_pool_->deallocate(request_to_preempt.get());
283284
running_queue_.pop_back();
284285
// add preemptable request to waiting priority queue
285286
request_to_preempt->set_preempted();
@@ -339,7 +340,7 @@ void ContinuousScheduler::handle_abnormal_request(
339340
<< "Running queue size is not 1, there maybe a bug of request "
340341
"preemption logic. running_queue_.size ="
341342
<< running_queue_.size();
342-
if (util::sum(block_manager_->num_used_blocks()) !=
343+
if (util::sum(block_manager_pool_->num_used_blocks()) !=
343344
request->total_num_blocks()) {
344345
// blocks_exhausted is true.
345346
// NOTE: consider dp > 1, here we need get all num blocks in use.
@@ -353,7 +354,7 @@ void ContinuousScheduler::handle_abnormal_request(
353354

354355
// request is too long, budget or memory no enough.
355356
running_queue_.pop_front();
356-
block_manager_->deallocate(request.get());
357+
block_manager_pool_->deallocate(request.get());
357358
response_processor_->process_failed_request(
358359
request,
359360
{StatusCode::RESOURCE_EXHAUSTED,
@@ -384,13 +385,13 @@ void ContinuousScheduler::handle_running_requests(
384385
// check if the request can be expanded
385386
if (request->expand_sequences()) {
386387
// cache the blocks to share among the sequences
387-
block_manager_->cache(request->sequences()[0].get());
388+
block_manager_pool_->cache(request->sequences()[0].get());
388389
}
389390

390391
// release blocks for finished sequences here
391392
for (auto& sequence : request->sequences()) {
392393
if (sequence->finished()) {
393-
block_manager_->deallocate(sequence.get());
394+
block_manager_pool_->deallocate(sequence.get());
394395
}
395396
}
396397
}
@@ -428,7 +429,7 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
428429
std::shared_ptr<Request> request = *it;
429430
request->update_connection_status();
430431
if (request->finished() || request->cancelled()) {
431-
block_manager_->deallocate(request.get());
432+
block_manager_pool_->deallocate(request.get());
432433
// release the ownership of the request
433434
finished_requests.emplace_back(request);
434435
// finished request is set to nullptr
@@ -516,11 +517,12 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
516517

517518
GAUGE_SET(num_running_sequences, running_sequences_.size());
518519

519-
GAUGE_SET(kv_cache_utilization_perc, block_manager_->kv_cache_utilization());
520+
GAUGE_SET(kv_cache_utilization_perc,
521+
block_manager_pool_->kv_cache_utilization());
520522
GAUGE_SET(num_blocks_in_prefix_cache,
521-
util::min(block_manager_->num_blocks_in_prefix_cache()));
522-
GAUGE_SET(num_free_blocks, util::max(block_manager_->num_free_blocks()));
523-
GAUGE_SET(num_used_blocks, util::min(block_manager_->num_used_blocks()));
523+
util::min(block_manager_pool_->num_blocks_in_prefix_cache()));
524+
GAUGE_SET(num_free_blocks, util::max(block_manager_pool_->num_free_blocks()));
525+
GAUGE_SET(num_used_blocks, util::min(block_manager_pool_->num_used_blocks()));
524526
return batches;
525527
}
526528

@@ -656,8 +658,8 @@ void ContinuousScheduler::process_batch_output(bool enable_schedule_overlap) {
656658
get_num_occupied_slots(to_be_processed_sequences);
657659
std::vector<int64_t> active_activation_size_in_bytes =
658660
get_active_activation_in_bytes();
659-
int64_t num_total_slots = block_manager_->options().num_blocks() *
660-
block_manager_->options().block_size();
661+
int64_t num_total_slots = block_manager_pool_->options().num_blocks() *
662+
block_manager_pool_->options().block_size();
661663
for (int32_t dp_rank = 0; dp_rank < options_.dp_size(); ++dp_rank) {
662664
double occupied_slots_ratio =
663665
static_cast<double>(num_occupied_slots[dp_rank]) / num_total_slots;
@@ -712,16 +714,16 @@ void ContinuousScheduler::process_batch_output(bool enable_schedule_overlap) {
712714

713715
std::vector<Block> ContinuousScheduler::allocate_blocks_for(size_t token_num,
714716
int32_t& dp_rank) {
715-
return block_manager_->allocate(token_num, dp_rank);
717+
return block_manager_pool_->allocate(token_num, dp_rank);
716718
}
717719

718720
std::vector<int64_t> ContinuousScheduler::get_num_occupied_slots(
719721
std::vector<Sequence*>& sequences) const {
720722
std::vector<int64_t> num_occupied_slots(options_.dp_size());
721723
std::vector<int64_t> num_unfilled_blocks(options_.dp_size());
722-
std::vector<size_t> num_used_blocks = block_manager_->num_used_blocks();
724+
std::vector<size_t> num_used_blocks = block_manager_pool_->num_used_blocks();
723725

724-
const int block_size = block_manager_->options().block_size();
726+
const int block_size = block_manager_pool_->options().block_size();
725727

726728
for (auto& sequence : sequences) {
727729
const int32_t dp_rank = sequence->dp_rank();

xllm/core/scheduler/continuous_scheduler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ class ContinuousScheduler : public Scheduler {
126126
Engine* engine_;
127127

128128
// the block manager to manage the cache blocks
129-
BlockManagerPool* block_manager_;
129+
BlockManagerPool* block_manager_pool_;
130130

131131
// a thread safe queue of requests, bounded by kRequestQueueSize
132132
// the schedule owns the requests and manages their lifetimes.

0 commit comments

Comments (0)