refactor: clean up the request priority and online/offline preemption code.

weizhehuang0827 · weizhehuang0827 · commit f1ebe834c9fc · 2025-08-30T13:18:25.000+08:00
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
@@ -202,6 +202,6 @@ DEFINE_int32(heart_beat_interval, 3, "heart beat interval");
 
 DEFINE_string(priority_strategy, "FCFS", "priority strategy for requests");
 
-DEFINE_bool(enable_on_preempt_off,
+DEFINE_bool(enable_online_preempt_offline,
             true,
             "whether enable online preempt offline");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
@@ -129,4 +129,4 @@ DECLARE_bool(use_zero_evict);
 
 DECLARE_string(priority_strategy);
 
-DECLARE_bool(enable_on_preempt_off);
+DECLARE_bool(enable_online_preempt_offline);
diff --git a/xllm/core/common/metrics.cpp b/xllm/core/common/metrics.cpp
@@ -88,13 +88,13 @@ DEFINE_GAUGE(num_running_requests, "Number of running requests in scheduler");
 DEFINE_GAUGE(num_waiting_requests, "Number of waiting requests in scheduler");
 DEFINE_GAUGE(num_preempted_requests,
              "Number of preempted requests in scheduler");
-DEFINE_GAUGE(num_offd_preempt_off_requests,
+DEFINE_GAUGE(num_offline_decode_preempt_offline_requests,
              "Number of offline decode preempt offline requests in scheduler");
-DEFINE_GAUGE(num_ond_preempt_on_requests,
+DEFINE_GAUGE(num_online_decode_preempt_online_requests,
              "Number of online decode preempt online requests in scheduler");
-DEFINE_GAUGE(num_onp_preempt_off_requests,
+DEFINE_GAUGE(num_online_prefill_preempt_offline_requests,
              "Number of online prefill preempt offline requests in scheduler");
-DEFINE_GAUGE(num_ond_preempt_off_requests,
+DEFINE_GAUGE(num_online_decode_preempt_offline_requests,
              "Number of online decode preempt offline requests in scheduler");
 
 DEFINE_GAUGE(num_running_sequences, "Number of running sequences");
diff --git a/xllm/core/common/metrics.h b/xllm/core/common/metrics.h
@@ -149,10 +149,10 @@ DECLARE_GAUGE(num_pending_requests);
 DECLARE_GAUGE(num_running_requests);
 DECLARE_GAUGE(num_waiting_requests);
 DECLARE_GAUGE(num_preempted_requests);
-DECLARE_GAUGE(num_offd_preempt_off_requests);
-DECLARE_GAUGE(num_ond_preempt_on_requests);
-DECLARE_GAUGE(num_onp_preempt_off_requests);
-DECLARE_GAUGE(num_ond_preempt_off_requests);
+DECLARE_GAUGE(num_offline_decode_preempt_offline_requests);
+DECLARE_GAUGE(num_online_decode_preempt_online_requests);
+DECLARE_GAUGE(num_online_prefill_preempt_offline_requests);
+DECLARE_GAUGE(num_online_decode_preempt_offline_requests);
 DECLARE_GAUGE(num_running_sequences);
 DECLARE_GAUGE(kv_cache_utilization_perc);
 DECLARE_GAUGE(num_blocks_in_prefix_cache);
diff --git a/xllm/core/common/options.h b/xllm/core/common/options.h
@@ -116,7 +116,7 @@ class Options {
 
   PROPERTY(std::string, priority_strategy) = "FCFS";
 
-  PROPERTY(bool, enable_on_preempt_off) = true;
+  PROPERTY(bool, enable_online_preempt_offline) = true;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/disagg_pd_service_impl.cpp b/xllm/core/distributed_runtime/disagg_pd_service_impl.cpp
@@ -90,14 +90,15 @@ std::shared_ptr<Request> DisaggPDServiceImpl::generate_request(
                          output_callback,
                          batch_output_callback);
 
-  auto new_request = std::make_shared<Request>(req.req_id(),
-                                               req.x_request_id(),
-                                               req.x_request_time(),
-                                               std::move(req_state),
-                                               req.service_req_id(),
-                                               req.offline(),
-                                               req.slo_ms(),
-                                               req.priority());
+  auto new_request = std::make_shared<Request>(
+      req.req_id(),
+      req.x_request_id(),
+      req.x_request_time(),
+      std::move(req_state),
+      req.service_req_id(),
+      req.offline(),
+      req.slo_ms(),
+      static_cast<xllm::RequestPriority>(req.priority()));
 
   // add one sequence, rest will be added by scheduler
   return new_request;
diff --git a/xllm/core/framework/block/block_manager.h b/xllm/core/framework/block/block_manager.h
@@ -62,10 +62,6 @@ class BlockManager {
 
   virtual void cache(const Slice<int32_t>& token_ids,
                      const Slice<Block>& blocks) = 0;
-  virtual bool check_if_enough_to_evict(
-      DecodePriorityQueue* running_queue_to_evict,
-      Sequence* prefill_sequence,
-      size_t& num_request_to_evict) = 0;
 
   // get merged all dp rank KVCacheEvent
   virtual void get_merged_kvcache_event(KvCacheEvent* event) const = 0;
diff --git a/xllm/core/framework/block/block_manager_impl.cpp b/xllm/core/framework/block/block_manager_impl.cpp
@@ -74,33 +74,6 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
   }
 }
 
-bool BlockManagerImpl::check_if_enough_to_evict(
-    DecodePriorityQueue* running_queue_to_evict,
-    Sequence* prefill_sequence,
-    size_t& num_request_to_evict) {
-  // check if it's enough when we evict this requests queue
-
-  const size_t num_blocks_needed =
-      (prefill_sequence->num_tokens() + block_size_ - 1) / block_size_;
-  size_t num_blocks_can_evict = 0;
-  // count the number of blocks can be preempted
-  for (auto it = running_queue_to_evict->rbegin();
-       it != running_queue_to_evict->rend();
-       ++it) {
-    std::shared_ptr<Request> request_to_preempt = *it;
-    num_request_to_evict++;
-    // count the number of blocks belong to the request
-    for (const auto& seq : request_to_preempt->sequences()) {
-      num_blocks_can_evict += seq->kv_state().num_kv_blocks();
-    }
-    if ((num_blocks_needed <= num_blocks_can_evict) ||
-        has_enough_blocks(num_blocks_needed - num_blocks_can_evict)) {
-      return true;
-    }
-  }
-  return false;
-}
-
 bool BlockManagerImpl::has_enough_blocks(uint32_t num_blocks) {
   if (num_blocks <= num_free_blocks_) {
     return true;
diff --git a/xllm/core/framework/block/block_manager_impl.h b/xllm/core/framework/block/block_manager_impl.h
@@ -46,10 +46,6 @@ class BlockManagerImpl : public BlockManager {
 
   void get_merged_kvcache_event(KvCacheEvent* event) const override;
 
-  bool check_if_enough_to_evict(DecodePriorityQueue* running_queue_to_evict,
-                                Sequence* prefill_sequence,
-                                size_t& num_request_to_evict) override;
-
   size_t num_blocks_in_prefix_cache() const override {
     if (options_.enable_prefix_cache()) {
       CHECK(prefix_cache_);
diff --git a/xllm/core/framework/block/block_manager_pool.cpp b/xllm/core/framework/block/block_manager_pool.cpp
@@ -92,16 +92,6 @@ bool BlockManagerPool::allocate(Sequence* sequence) {
   return allocate(sequence, sequence->num_tokens());
 }
 
-bool BlockManagerPool::check_if_enough_to_evict(
-    DecodePriorityQueue* running_queue_to_evict,
-    Sequence* prefill_sequence,
-    size_t& num_request_to_evict) {
-  DCHECK(prefill_sequence != nullptr);
-  int32_t dp_rank = prefill_sequence->dp_rank();
-  return block_managers_[dp_rank]->check_if_enough_to_evict(
-      running_queue_to_evict, prefill_sequence, num_request_to_evict);
-}
-
 bool BlockManagerPool::allocate(std::vector<Sequence*>& sequences) {
   for (auto* sequence : sequences) {
     DCHECK(sequence != nullptr);
diff --git a/xllm/core/framework/block/block_manager_pool.h b/xllm/core/framework/block/block_manager_pool.h
@@ -48,10 +48,6 @@ class BlockManagerPool {
   void get_merged_kvcache_event(KvCacheEvent* event) const;
   float get_gpu_cache_usage_perc() const;
 
-  bool check_if_enough_to_evict(DecodePriorityQueue* running_queue_to_evict,
-                                Sequence* prefill_sequence,
-                                size_t& num_request_to_evict);
-
   std::vector<size_t> num_blocks_in_prefix_cache() const;
   std::vector<size_t> num_free_blocks() const;
   std::vector<size_t> num_used_blocks() const;
diff --git a/xllm/core/framework/block/concurrent_block_manager_impl.cpp b/xllm/core/framework/block/concurrent_block_manager_impl.cpp
@@ -43,15 +43,6 @@ void ConcurrentBlockManagerImpl::cache(const Slice<int32_t>& token_ids,
   BlockManagerImpl::cache(token_ids, blocks);
 }
 
-bool ConcurrentBlockManagerImpl::check_if_enough_to_evict(
-    DecodePriorityQueue* running_queue_to_evict,
-    Sequence* prefill_sequence,
-    size_t& num_request_to_evict) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  return BlockManagerImpl::check_if_enough_to_evict(
-      running_queue_to_evict, prefill_sequence, num_request_to_evict);
-}
-
 size_t ConcurrentBlockManagerImpl::num_blocks_in_prefix_cache() const {
   std::lock_guard<std::mutex> lock(mutex_);
   return BlockManagerImpl::num_blocks_in_prefix_cache();
diff --git a/xllm/core/framework/block/concurrent_block_manager_impl.h b/xllm/core/framework/block/concurrent_block_manager_impl.h
@@ -39,10 +39,6 @@ class ConcurrentBlockManagerImpl : public BlockManagerImpl {
   void cache(const Slice<int32_t>& token_ids,
              const Slice<Block>& blocks) override;
 
-  bool check_if_enough_to_evict(DecodePriorityQueue* running_queue_to_evict,
-                                Sequence* prefill_sequence,
-                                size_t& num_request_to_evict) override;
-
   // get the number of blocks in the prefix cache
   size_t num_blocks_in_prefix_cache() const override;
 
diff --git a/xllm/core/framework/request/request.cpp b/xllm/core/framework/request/request.cpp
@@ -36,7 +36,7 @@ Request::Request(const std::string& request_id,
                  const std::string& service_request_id,
                  bool offline,
                  int32_t slo_ms,
-                 xllm::proto::Priority priority)
+                 RequestPriority priority)
     : request_id_(request_id),
       service_request_id_(service_request_id),
       x_request_id_(x_request_id),
diff --git a/xllm/core/framework/request/request.h b/xllm/core/framework/request/request.h
@@ -31,6 +31,8 @@ limitations under the License.
 
 namespace xllm {
 
+enum class RequestPriority { DEFAULT = 0, HIGH = 1, NORMAL = 2, LOW = 3 };
+
 class Request {
  public:
   Request(const std::string& request_id,
@@ -40,7 +42,7 @@ class Request {
           const std::string& service_request_id = "",
           bool offline = false,
           int32_t slo_ms = 0,
-          xllm::proto::Priority priority = xllm::proto::Priority::NORMAL);
+          RequestPriority priority = RequestPriority::NORMAL);
 
   bool finished() const;
 
@@ -86,7 +88,7 @@ class Request {
 
   const bool offline() const { return offline_; }
   const int32_t slo_ms() const { return slo_ms_; }
-  const xllm::proto::Priority priority() const { return priority_; }
+  const RequestPriority priority() const { return priority_; }
 
   RequestState& state() { return state_; }
 
@@ -119,7 +121,7 @@ class Request {
 
   int32_t slo_ms_;
 
-  xllm::proto::Priority priority_;
+  RequestPriority priority_;
 
  private:
   void create_sequences_group();
diff --git a/xllm/core/framework/request/request_params.cpp b/xllm/core/framework/request/request_params.cpp
@@ -54,7 +54,7 @@ RequestParams::RequestParams(const proto::CompletionRequest& request,
     slo_ms = request.slo_ms();
   }
   if (request.has_priority()) {
-    priority = request.priority();
+    priority = static_cast<xllm::RequestPriority>(request.priority());
   }
 
   if (request.has_service_request_id()) {
@@ -203,7 +203,7 @@ void InitFromChatRequest(RequestParams& params, const ChatRequest& request) {
     params.slo_ms = request.slo_ms();
   }
   if (request.has_priority()) {
-    params.priority = request.priority();
+    params.priority = static_cast<xllm::RequestPriority>(request.priority());
   }
 
   if (request.has_service_request_id()) {
diff --git a/xllm/core/framework/request/request_params.h b/xllm/core/framework/request/request_params.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "core/common/types.h"
 #include "embedding.pb.h"
 #include "multimodal.pb.h"
+#include "request.h"
 #include "request_output.h"
 
 namespace xllm {
@@ -130,7 +131,7 @@ struct RequestParams {
 
   int32_t slo_ms = 0;
 
-  xllm::proto::Priority priority = xllm::proto::Priority::NORMAL;
+  RequestPriority priority = RequestPriority::NORMAL;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/runtime/options.h b/xllm/core/runtime/options.h
@@ -124,7 +124,7 @@ struct Options {
 
   PROPERTY(std::string, priority_strategy) = "FCFS";
 
-  PROPERTY(bool, enable_on_preempt_off) = true;
+  PROPERTY(bool, enable_online_preempt_offline) = true;
 };
 
 }  // namespace runtime
diff --git a/xllm/core/scheduler/chunked_prefill_scheduler.cpp b/xllm/core/scheduler/chunked_prefill_scheduler.cpp
@@ -222,7 +222,7 @@ void ChunkedPrefillScheduler::handle_running_queue_requests(
       if (request_to_preempt.get() != request.get()) {
         ++num_preempted_requests;
         block_manager_pool_->deallocate(request_to_preempt.get());
-        running_queue_.->pop_back();
+        running_queue_->pop_back();
         // add preemptable request to waiting priority queue
         request_to_preempt->set_preempted();
         waiting_priority_queue_.push(request_to_preempt);
diff --git a/xllm/core/scheduler/continuous_scheduler.cpp b/xllm/core/scheduler/continuous_scheduler.cpp
diff --git a/xllm/core/scheduler/continuous_scheduler.h b/xllm/core/scheduler/continuous_scheduler.h
diff --git a/xllm/core/scheduler/continuous_scheduler_test.cpp b/xllm/core/scheduler/continuous_scheduler_test.cpp
diff --git a/xllm/core/scheduler/disagg_pd_scheduler.cpp b/xllm/core/scheduler/disagg_pd_scheduler.cpp
diff --git a/xllm/core/util/concurrent_queue.h b/xllm/core/util/concurrent_queue.h
diff --git a/xllm/xllm.cpp b/xllm/xllm.cpp

Original file line number	Diff line number	Diff line change
`@@ -129,4 +129,4 @@ DECLARE_bool(use_zero_evict);`
`129`	`129`
`130`	`130`	`DECLARE_string(priority_strategy);`
`131`	`131`
`132`		`-DECLARE_bool(enable_on_preempt_off);`
	`132`	`+DECLARE_bool(enable_online_preempt_offline);`