diff --git a/src/rate_limiter.cc b/src/rate_limiter.cc index e89c94cc2..47917fbd2 100644 --- a/src/rate_limiter.cc +++ b/src/rate_limiter.cc @@ -578,10 +578,51 @@ RateLimiter::AttemptAllocation() { std::lock_guard lk(staged_instances_mtx_); if (!staged_instances_.empty()) { - ModelInstanceContext* instance = staged_instances_.top(); - if (resource_manager_->AllocateResources(instance)) { + // Greedy allocation: Try to allocate ANY instance that has sufficient + // resources, not just the top priority one. This prevents deadlocks + // when BLS calls require resources and the child model stucks due to + // head‑of‑line blocking + + // First, try the top instance (highest priority) + ModelInstanceContext* top_instance = staged_instances_.top(); + if (resource_manager_->AllocateResources(top_instance)) { staged_instances_.pop(); - instance->Allocate(); + top_instance->Allocate(); + // After allocating one instance, recursively try to allocate more + // in case resources were freed up + AttemptAllocation(); + return; + } + + // If top instance can't be allocated, try other staged instances + // to find one that can be allocated (greedy approach) + std::vector temp_instances; + temp_instances.push_back(top_instance); + staged_instances_.pop(); + + bool allocated = false; + while (!staged_instances_.empty() && !allocated) { + ModelInstanceContext* instance = staged_instances_.top(); + staged_instances_.pop(); + + if (resource_manager_->AllocateResources(instance)) { + // Found an instance that can be allocated + instance->Allocate(); + allocated = true; + } else { + // Can't allocate this instance, save it for later + temp_instances.push_back(instance); + } + } + + // Restore unallocated instances back to the priority queue + for (auto* instance : temp_instances) { + staged_instances_.push(instance); + } + + // If we allocated an instance, try again in case more can be allocated + if (allocated) { + AttemptAllocation(); } } }