jd-opensource
diff --git a/‎xllm/core/common/global_flags.cpp‎
Lines changed: 6 additions & 0 deletions b/‎xllm/core/common/global_flags.cpp‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎xllm/core/common/global_flags.h‎
Lines changed: 4 additions & 0 deletions b/‎xllm/core/common/global_flags.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎xllm/core/common/metrics.cpp‎
Lines changed: 8 additions & 0 deletions b/‎xllm/core/common/metrics.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎xllm/core/common/metrics.h‎
Lines changed: 4 additions & 0 deletions b/‎xllm/core/common/metrics.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎xllm/core/common/options.h‎
Lines changed: 4 additions & 0 deletions b/‎xllm/core/common/options.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎xllm/core/distributed_runtime/disagg_pd_service_impl.cpp‎
Lines changed: 4 additions & 1 deletion b/‎xllm/core/distributed_runtime/disagg_pd_service_impl.cpp‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎xllm/core/framework/block/block_manager.h‎
Lines changed: 8 additions & 1 deletion b/‎xllm/core/framework/block/block_manager.h‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎xllm/core/framework/block/block_manager_impl.cpp‎
Lines changed: 28 additions & 0 deletions b/‎xllm/core/framework/block/block_manager_impl.cpp‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎xllm/core/framework/block/block_manager_impl.h‎
Lines changed: 7 additions & 0 deletions b/‎xllm/core/framework/block/block_manager_impl.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎xllm/core/framework/block/block_manager_pool.cpp‎
Lines changed: 10 additions & 0 deletions b/‎xllm/core/framework/block/block_manager_pool.cpp‎
Lines changed: 10 additions & 0 deletions
@@ -199,3 +199,9 @@ DEFINE_string(etcd_addr, "", "etcd adderss for save instance meta info");
 DEFINE_bool(enable_service_routing, false, "whether to use etcd.");
 
 DEFINE_int32(heart_beat_interval, 3, "heart beat interval");
+
+DEFINE_string(priority_strategy, "FCFS", "priority strategy for requests");  // same default as Options::priority_strategy
+
+DEFINE_bool(enable_on_preempt_off,  // "on"/"off": online/offline requests, per the help text
+            true,  // enabled by default
+            "whether enable online preempt offline");
@@ -126,3 +126,7 @@ DECLARE_int32(heart_beat_interval);
 DECLARE_int32(chunked_match_frequency);
 
 DECLARE_bool(use_zero_evict);
+
+DECLARE_string(priority_strategy);  // priority strategy for requests
+
+DECLARE_bool(enable_on_preempt_off);  // whether online requests may preempt offline ones
@@ -88,6 +88,14 @@ DEFINE_GAUGE(num_running_requests, "Number of running requests in scheduler");
 DEFINE_GAUGE(num_waiting_requests, "Number of waiting requests in scheduler");
 DEFINE_GAUGE(num_preempted_requests,
              "Number of preempted requests in scheduler");
+DEFINE_GAUGE(num_offd_preempt_off_requests,  // "offd": offline decode
+             "Number of offline decode preempt offline requests in scheduler");
+DEFINE_GAUGE(num_ond_preempt_on_requests,  // "ond": online decode
+             "Number of online decode preempt online requests in scheduler");
+DEFINE_GAUGE(num_onp_preempt_off_requests,  // "onp": online prefill
+             "Number of online prefill preempt offline requests in scheduler");
+DEFINE_GAUGE(num_ond_preempt_off_requests,  // "ond": online decode
+             "Number of online decode preempt offline requests in scheduler");
 
 DEFINE_GAUGE(num_running_sequences, "Number of running sequences");
 
 
@@ -149,6 +149,10 @@ DECLARE_GAUGE(num_pending_requests);
 DECLARE_GAUGE(num_running_requests);
 DECLARE_GAUGE(num_waiting_requests);
 DECLARE_GAUGE(num_preempted_requests);
+DECLARE_GAUGE(num_offd_preempt_off_requests);  // offline decode preempts offline
+DECLARE_GAUGE(num_ond_preempt_on_requests);  // online decode preempts online
+DECLARE_GAUGE(num_onp_preempt_off_requests);  // online prefill preempts offline
+DECLARE_GAUGE(num_ond_preempt_off_requests);  // online decode preempts offline
 DECLARE_GAUGE(num_running_sequences);
 DECLARE_GAUGE(kv_cache_utilization_perc);
 DECLARE_GAUGE(num_blocks_in_prefix_cache);
 
@@ -113,6 +113,10 @@ class Options {
   PROPERTY(bool, enable_service_routing) = false;
 
   PROPERTY(std::optional<std::string>, tool_call_parser);
+
+  PROPERTY(std::string, priority_strategy) = "FCFS";  // same default as the priority_strategy flag
+
+  PROPERTY(bool, enable_on_preempt_off) = true;  // same default as the enable_on_preempt_off flag
 };
 
 }  // namespace xllm
@@ -94,7 +94,10 @@ std::shared_ptr<Request> DisaggPDServiceImpl::generate_request(
                                                req.x_request_id(),
                                                req.x_request_time(),
                                                std::move(req_state),
-                                               req.service_req_id());
+                                               req.service_req_id(),
+                                               req.offline(),
+                                               req.slo_ms(),
+                                               req.priority());
 
   // add one sequence, rest will be added by scheduler
   return new_request;
 
@@ -32,10 +32,13 @@ limitations under the License.
 #include "common/metrics.h"
 #include "common/types.h"
 #include "framework/prefix_cache/prefix_cache.h"
+#include "framework/request/request.h"
+#include "framework/request/sequence.h"
+#include "scheduler/decode_priority_queue.h"
 #include "util/timer.h"
 
 namespace xllm {
-
+// class DecodePriorityQueue;
 class BlockManager {
  public:
   struct Options {
@@ -59,6 +62,10 @@ class BlockManager {
 
   virtual void cache(const Slice<int32_t>& token_ids,
                      const Slice<Block>& blocks) = 0;
+  virtual bool check_if_enough_to_evict(  // true if evicting candidates frees enough blocks
+      DecodePriorityQueue* running_queue_to_evict,  // eviction candidates
+      Sequence* prefill_sequence,  // sequence that needs the blocks
+      size_t& num_request_to_evict) = 0;  // in/out: BlockManagerImpl increments it per candidate visited
 
   // get merged all dp rank KVCacheEvent
   virtual void get_merged_kvcache_event(KvCacheEvent* event) const = 0;
 
@@ -30,6 +30,7 @@ BlockManagerImpl::BlockManagerImpl(const Options& options)
   }
 
   size_t total_blocks = options_.num_blocks();
+  block_size_ = options_.block_size();
   num_free_blocks_ = total_blocks;
   free_blocks_.reserve(total_blocks);
   for (int32_t i = 0; i < total_blocks; ++i) {
@@ -73,6 +74,33 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
   }
 }
 
+// Decides whether evicting requests from running_queue_to_evict can make room
+// for prefill_sequence.
+//
+// The queue is walked from rbegin() towards rend(). For every request visited,
+// num_request_to_evict is incremented and the KV blocks held by all of its
+// sequences are added to a running total. The walk stops with `true` as soon
+// as that total, together with what has_enough_blocks() reports as available,
+// covers the blocks needed by prefill_sequence. If the whole queue is
+// exhausted first, `false` is returned.
+//
+// num_request_to_evict is an in/out counter: it is never reset here, so the
+// caller must initialise it, and on a `false` return it has still been
+// advanced by the number of requests in the queue.
+bool BlockManagerImpl::check_if_enough_to_evict(
+    DecodePriorityQueue* running_queue_to_evict,
+    Sequence* prefill_sequence,
+    size_t& num_request_to_evict) {
+  DCHECK(running_queue_to_evict != nullptr);
+  DCHECK(prefill_sequence != nullptr);
+  // Guards the ceiling division below against a zero divisor.
+  CHECK(block_size_ > 0);
+
+  // NOTE(review): the demand is derived from the full token count; blocks the
+  // sequence may already hold are not subtracted -- confirm this is intended.
+  const size_t num_blocks_needed =
+      (prefill_sequence->num_tokens() + block_size_ - 1) / block_size_;
+  size_t num_blocks_can_evict = 0;
+
+  for (auto it = running_queue_to_evict->rbegin();
+       it != running_queue_to_evict->rend();
+       ++it) {
+    // Bind by reference: the request is only inspected, so there is no need
+    // to pay for a shared_ptr copy on every iteration.
+    const std::shared_ptr<Request>& request_to_preempt = *it;
+    ++num_request_to_evict;
+    // Accumulate the blocks held by every sequence of this request.
+    for (const auto& seq : request_to_preempt->sequences()) {
+      num_blocks_can_evict += seq->kv_state().num_kv_blocks();
+    }
+    // The first test also keeps the subtraction in the second one from
+    // wrapping around, because || short-circuits.
+    if ((num_blocks_needed <= num_blocks_can_evict) ||
+        has_enough_blocks(num_blocks_needed - num_blocks_can_evict)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool BlockManagerImpl::has_enough_blocks(uint32_t num_blocks) {
   if (num_blocks <= num_free_blocks_) {
     return true;
 
@@ -46,6 +46,10 @@ class BlockManagerImpl : public BlockManager {
 
   void get_merged_kvcache_event(KvCacheEvent* event) const override;
 
+  bool check_if_enough_to_evict(DecodePriorityQueue* running_queue_to_evict,  // walked from rbegin() to rend()
+                                Sequence* prefill_sequence,  // its num_tokens() sets the block demand
+                                size_t& num_request_to_evict) override;  // in/out: incremented per request visited
+
   size_t num_blocks_in_prefix_cache() const override {
     if (options_.enable_prefix_cache()) {
       CHECK(prefix_cache_);
@@ -99,6 +103,9 @@ class BlockManagerImpl : public BlockManager {
   // free block count
   size_t num_free_blocks_ = 0;
 
+  // tokens per block; assigned from options_.block_size() in the constructor
+  size_t block_size_ = 0;
+
   // free block list
   std::vector<int32_t> free_blocks_;
 };
 
@@ -92,6 +92,16 @@ bool BlockManagerPool::allocate(Sequence* sequence) {
   return allocate(sequence, sequence->num_tokens());
 }
 
+// Forwards the eviction feasibility check to the block manager selected by
+// prefill_sequence->dp_rank(). All three arguments are passed through
+// unchanged, so num_request_to_evict is updated by the selected manager.
+bool BlockManagerPool::check_if_enough_to_evict(
+    DecodePriorityQueue* running_queue_to_evict,
+    Sequence* prefill_sequence,
+    size_t& num_request_to_evict) {
+  DCHECK(running_queue_to_evict != nullptr);
+  DCHECK(prefill_sequence != nullptr);
+  const int32_t dp_rank = prefill_sequence->dp_rank();
+  // A negative rank converts to a huge unsigned value, so this single
+  // comparison rejects both out-of-range ends before indexing.
+  DCHECK_LT(static_cast<size_t>(dp_rank), block_managers_.size());
+  return block_managers_[dp_rank]->check_if_enough_to_evict(
+      running_queue_to_evict, prefill_sequence, num_request_to_evict);
+}
+
 bool BlockManagerPool::allocate(std::vector<Sequence*>& sequences) {
   for (auto* sequence : sequences) {
     DCHECK(sequence != nullptr);