jd-opensource
diff --git a/‎docs/en/features/eplb.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/en/features/eplb.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/zh/features/eplb.md‎
Lines changed: 3 additions & 3 deletions b/‎docs/zh/features/eplb.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎xllm/core/common/global_flags.cpp‎
Lines changed: 5 additions & 1 deletion b/‎xllm/core/common/global_flags.cpp‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎xllm/core/common/global_flags.h‎
Lines changed: 3 additions & 1 deletion b/‎xllm/core/common/global_flags.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎xllm/core/common/options.h‎
Lines changed: 3 additions & 1 deletion b/‎xllm/core/common/options.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎xllm/core/common/types.h‎
Lines changed: 8 additions & 0 deletions b/‎xllm/core/common/types.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎xllm/core/framework/eplb/eplb_executor.h‎
Lines changed: 8 additions & 0 deletions b/‎xllm/core/framework/eplb/eplb_executor.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎xllm/core/framework/eplb/eplb_manager.cpp‎
Lines changed: 12 additions & 12 deletions b/‎xllm/core/framework/eplb/eplb_manager.cpp‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎xllm/core/framework/eplb/eplb_manager.h‎
Lines changed: 15 additions & 5 deletions b/‎xllm/core/framework/eplb/eplb_manager.h‎
Lines changed: 15 additions & 5 deletions
diff --git a/‎xllm/core/framework/eplb/eplb_policy.cpp‎
Lines changed: 5 additions & 5 deletions b/‎xllm/core/framework/eplb/eplb_policy.cpp‎
Lines changed: 5 additions & 5 deletions
@@ -23,8 +23,8 @@ Simply add the following gflag parameters when launching xLLM:
 
 - xLLM provides the gflag parameter `enable_eplb` (default: false). Set to true in the xLLM service startup script to enable dynamic expert load balancing.
 - `expert_parallel_degree` and `ep_size` are MoE-related parameters. `expert_parallel_degree` should be set to `2`, and `ep_size` must match the actual number of NPU/GPU devices. See [moe_params](./moe_params.md)
-- `eplb_update_rate` sets the expert distribution update interval in seconds (default: 1000).
+- `eplb_update_interval` sets the expert distribution update interval in seconds (default: 1000).
 - The expert distribution update uses a layer-by-layer mechanism based on expert load. When the similarity between consecutive loads for a layer is below `eplb_update_threshold`, that layer is updated (default: 1, range: 0-1).
 
 ```bash
---enable_eplb=true --expert_parallel_degree=2 --ep_size=16 --eplb_update_rate=2000 --eplb_update_threshold=0.9
+--enable_eplb=true --expert_parallel_degree=2 --ep_size=16 --eplb_update_interval=2000 --eplb_update_threshold=0.9
@@ -18,14 +18,14 @@ xLLM eplb功能主要通过以下三个模块实现：
 
 - xLLM中提供了gflags参数`enable_eplb`，默认false，如需开启动态专家负载均衡，在xLLM的服务启动脚本中设置为true即可。
 - `expert_parallel_degree`与`ep_size`为moe相关参数，`expert_parallel_degree`需要设置为`2`，`ep_size`要与实际NPU/GPU卡个数保持一致。参考 [moe_params](./moe_params.md)
-- `eplb_update_rate`为专家分布更新时间间隔，单位为妙，默认值为1000.
-- 专家分布更新采用根据专家负载的逐层更新机制，当某一层专家的前后两次的负载相似度小于`eplb_update_threshold`时选择更新该层，默认值为1，取之范围为(0,1)。
+- `eplb_update_interval`为专家分布更新时间间隔，单位为秒，默认值为1000。
+- 专家分布更新采用根据专家负载的逐层更新机制，当某一层专家的前后两次的负载相似度小于`eplb_update_threshold`时选择更新该层，默认值为1，取值范围为(0,1)。
 
 ```bash
   --enable_eplb=true 
   --expert_parallel_degree=2 
   --ep_size=16  
-  --eplb_update_rate=2000
+  --eplb_update_interval=2000
   --eplb_update_threshold=0.9
 ```
 
 
@@ -118,7 +118,11 @@ DEFINE_string(communication_backend, "hccl", "npu communication backend.");
 
 DEFINE_bool(enable_eplb, false, "Whether to use ep load balance.");
 
-DEFINE_int64(eplb_update_rate, 1000, "eplb update rate.");
+DEFINE_int32(redundant_experts_num,
+             1,
+             "num of redundant experts on per device.");
+
+DEFINE_int64(eplb_update_interval, 1000, "eplb update interval in seconds.");
 
 DEFINE_double(eplb_update_threshold, 0.8, "eplb update threshold.");
 
 
@@ -69,7 +69,9 @@ DECLARE_string(communication_backend);
 
 DECLARE_bool(enable_eplb);
 
-DECLARE_int64(eplb_update_rate);
+DECLARE_int32(redundant_experts_num);
+
+DECLARE_int64(eplb_update_interval);
 
 DECLARE_double(eplb_update_threshold);
 
 
@@ -72,7 +72,9 @@ class Options {
 
   PROPERTY(std::optional<bool>, enable_eplb);
 
-  PROPERTY(std::optional<int64_t>, eplb_update_rate);
+  PROPERTY(std::optional<int32_t>, redundant_experts_num);
+
+  PROPERTY(std::optional<int64_t>, eplb_update_interval);
 
   PROPERTY(std::optional<double>, eplb_update_threshold);
 
 
@@ -253,8 +253,16 @@ struct JsonTool {
 };
 
 struct EplbInfo {
+  // Target layer ID for new expert weight pre-loading (-1 = no pending load)
+  // Values >=0 indicate the layer ID that should start loading new expert
+  // weights
   int32_t prepare_layer_id = -1;
+  // Expert IDs requiring updates, ordered by device shard assignment
+  // Contains per-device expert indices for distributed weight updates
   std::vector<int32_t> expert_ids;
+  // Layer ID ready for expert weight activation (-1 = no pending update)
+  // Values >=0 indicate the layer ID whose pre-loaded weights are ready for
+  // deployment
   int32_t update_layer_id = -1;
 };
 
 
@@ -18,8 +18,16 @@ class EplbExecutor final {
   EplbExecutor(CausalLM* model);
 
   virtual ~EplbExecutor();
+
+  // Reset the ready layer ID marker to -1 (no layer ready)
   void reset_ready_layer_id();
+
+  // Get the currently ready layer ID
+  // return int32_t Layer ID with prepared weights (-1 if none)
   int32_t get_ready_layer_id() const;
+
+  // Execute EPLB operation based on coordination info
+  // param eplb_info Contains layer preparation/activation instructions
   void eplb_execute(const EplbInfo& eplb_info);
 
  private:
 
@@ -20,17 +20,18 @@ namespace xllm {
 
 using namespace std::chrono_literals;
 
-EplbManager::EplbManager(EplbPolicy* eplb_policy,
-                         int32_t layer_num,
+EplbManager::EplbManager(int32_t layer_num,
                          int32_t device_num,
                          int32_t experts_num)
-    : eplb_policy_(eplb_policy),
-      layer_num_(layer_num),
+    : layer_num_(layer_num),
       device_num_(device_num),
       experts_num_(experts_num),
-      device_experts_num_((experts_num + device_num) / device_num) {
+      device_experts_num_(experts_num / device_num +
+                          FLAGS_redundant_experts_num) {
   // Initialize tensors with mutex protection
   {
+    eplb_policy_ = std::make_unique<EplbPolicy>(
+        device_experts_num_, device_num_, layer_num_);
     std::lock_guard<std::mutex> lock(state_.mtx);
     state_.expert_load =
         torch::zeros({layer_num_, experts_num_}, torch::kInt64);
@@ -39,11 +40,13 @@ EplbManager::EplbManager(EplbPolicy* eplb_policy,
         {layer_num_, device_num_, device_experts_num_}, torch::kInt32);
     for (int32_t layer = 0; layer < layer_num_; ++layer) {
       for (int32_t device = 0; device < device_num_; ++device) {
-        int32_t base = device * (device_experts_num_ - 1);
+        int32_t device_route_experts_num =
+            device_experts_num_ - FLAGS_redundant_experts_num;
+        int32_t base = device * device_route_experts_num;
         for (int32_t expert = 0; expert < device_experts_num_; ++expert) {
           int32_t value = base + expert;
-          if (expert == device_experts_num_ - 1) {
-            --value;
+          if (expert >= device_route_experts_num) {
+            value = base + device_route_experts_num - 1;
           }
           state_.expert_distribution[layer][device][expert] = value;
         }
@@ -105,7 +108,6 @@ void EplbManager::aggregate_multi_layer_expert_loads(
       layer_ids.emplace_back(ids.flatten().to(torch::kInt64));
       layer_loads.emplace_back(loads.flatten().to(torch::kInt64));
     }
-
     torch::Tensor all_ids = torch::cat(layer_ids);
     torch::Tensor all_loads = torch::cat(layer_loads);
     expert_load[layer].scatter_add_(0, all_ids, all_loads);
@@ -125,14 +127,12 @@ void EplbManager::rebalance_experts_loop() {
       if (state_.stop) return;
 
       while (!state_.expert_load_queue.empty()) {
-        // expert_load_batch.emplace_back(state_.expert_load_queue.front());
-        // state_.expert_load_queue.pop();
         aggregate_multi_layer_expert_loads(state_.expert_load,
                                            state_.expert_distribution,
                                            state_.expert_load_queue.front());
         state_.expert_load_queue.pop();
         int64_t current_time = absl::ToUnixSeconds(absl::Now());
-        if (current_time - latest_record_time >= FLAGS_eplb_update_rate) {
+        if (current_time - latest_record_time >= FLAGS_eplb_update_interval) {
           latest_record_time = current_time;
           auto result = eplb_policy_->rebalance_experts(state_.expert_load);
           state_.expert_distribution = result.first;
 
@@ -13,14 +13,24 @@ namespace xllm {
 
 class EplbManager {
  public:
-  EplbManager(EplbPolicy* eplb_policy,
-              int32_t layer_num,
-              int32_t device_num,
-              int32_t experts_num);
+  // Initialize with model dimensions:
+  // - layer_num: Total layers in the model
+  // - device_num: Parallel devices in cluster
+  // - experts_num: Experts per model layer
+  EplbManager(int32_t layer_num, int32_t device_num, int32_t experts_num);
+
   ~EplbManager();
 
+  // Feed new expert workload data for load balancing
+  // Input tensors should have shape [layer_num, experts_num]
   void update_expert_load(const std::vector<torch::Tensor> expert_load);
+
+  // Fetch current coordination instructions for expert updates
+  // Returns struct containing layer preparation/activation commands
   EplbInfo get_eplb_info();
+
+  // Mark specified layers as prepared (call after async loading completes)
+  // expert_layer_ids: Prepared layer IDs per device
   void set_prepared_layer_ids(const std::vector<int32_t>& expert_layer_ids);
 
  private:
@@ -49,7 +59,7 @@ class EplbManager {
   };
 
   // Components
-  EplbPolicy* eplb_policy_;
+  std::unique_ptr<EplbPolicy> eplb_policy_ = nullptr;
   ThreadSafeData state_;
 
   // Constants
 
@@ -15,7 +15,9 @@ EplbPolicy::EplbPolicy(int32_t device_experts_num,
       device_num_(device_num),
       layer_num_(layer_num) {
   old_expert_load_ =
-      torch::zeros({layer_num_, device_experts_num * device_num - device_num},
+      torch::zeros({layer_num_,
+                    device_experts_num * device_num -
+                        device_num * FLAGS_redundant_experts_num},
                    torch::kInt64);
   expert_distribution_ = torch::full(
       {layer_num_, device_num_, device_experts_num_}, -1, torch::kInt32);
@@ -32,9 +34,7 @@ std::pair<torch::Tensor, std::vector<bool>> EplbPolicy::rebalance_experts(
     auto prev_max_val = torch::max(prev_load).item<double>() + 1e-6f;
 
     current_load = (current_load / current_max_val).unsqueeze(0);
-    ;
     prev_load = (prev_load / prev_max_val).unsqueeze(0);
-    ;
 
     auto cos_sim =
         torch::nn::functional::cosine_similarity(
@@ -65,8 +65,8 @@ torch::Tensor EplbPolicy::compute_balanced_pack(
   const int64_t num_experts = expert_loads.size(0);
 
   // Generate Redundant Experts
-  auto [updated_weights, redundancy_map] =
-      update_origin_weights(expert_loads, device_num_);
+  auto [updated_weights, redundancy_map] = update_origin_weights(
+      expert_loads, device_num_ * FLAGS_redundant_experts_num);
 
   // Initialize Allocation Matrix
   auto options = torch::TensorOptions().dtype(torch::kInt64);