
Commit 374f67b

Support phi4 mini (#3467)

* support partial_rotary
* support phi-4
* update doc

1 parent: 8868159

File tree: 9 files changed (+34 −2 lines)

README.md (+1)

```diff
@@ -141,6 +141,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 <li>Phi-3-mini (3.8B)</li>
 <li>Phi-3.5-mini (3.8B)</li>
 <li>Phi-3.5-MoE (16x3.8B)</li>
+<li>Phi-4-mini (3.8B)</li>
 <li>MiniCPM3 (4B)</li>
 </ul>
 </td>
```

README_ja.md (+1)

```diff
@@ -139,6 +139,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
 <li>Phi-3-mini (3.8B)</li>
 <li>Phi-3.5-mini (3.8B)</li>
 <li>Phi-3.5-MoE (16x3.8B)</li>
+<li>Phi-4-mini (3.8B)</li>
 <li>MiniCPM3 (4B)</li>
 </ul>
 </td>
```

README_zh-CN.md (+1)

```diff
@@ -143,6 +143,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力，在各种规模的模型
 <li>Phi-3-mini (3.8B)</li>
 <li>Phi-3.5-mini (3.8B)</li>
 <li>Phi-3.5-MoE (16x3.8B)</li>
+<li>Phi-4-mini (3.8B)</li>
 <li>MiniCPM3 (4B)</li>
 </ul>
 </td>
```

docs/en/supported_models/supported_models.md (+2)

````diff
@@ -88,6 +88,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No |
 | Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - |
+| Phi-4-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes |
 | CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
 | CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
 | LLaVA(1.5,1.6)<sup>\[2\]</sup> | 7B-34B | MLLM | No | No | No | No | No |
@@ -104,6 +105,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
 | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
 | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
+| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - |

 ```{note}
 * [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
````
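
With the support matrix updated, the new model can be served like any other entry in the tables. A minimal usage sketch via LMDeploy's high-level `pipeline` API; the Hugging Face repo id `microsoft/Phi-4-mini-instruct` is an assumption for illustration, not taken from this commit:

```python
# Minimal sketch: run Phi-4-mini through LMDeploy's pipeline API.
# The repo id `microsoft/Phi-4-mini-instruct` is assumed, not from this diff.
from lmdeploy import pipeline

pipe = pipeline('microsoft/Phi-4-mini-instruct')
responses = pipe(['Introduce yourself in one sentence.'])
print(responses[0].text)
```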

docs/zh_cn/supported_models/supported_models.md (+2)

````diff
@@ -88,6 +88,7 @@
 | StarCoder2 | 3B-15B | LLM | Yes | Yes | Yes | No | No |
 | Phi-3-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes |
 | Phi-3-vision | 4.2B | MLLM | Yes | Yes | Yes | - | - |
+| Phi-4-mini | 3.8B | LLM | Yes | Yes | Yes | Yes | Yes |
 | CogVLM-Chat | 17B | MLLM | Yes | Yes | Yes | - | - |
 | CogVLM2-Chat | 19B | MLLM | Yes | Yes | Yes | - | - |
 | LLaVA(1.5,1.6)<sup>\[2\]</sup> | 7B-34B | MLLM | No | No | No | No | No |
@@ -104,6 +105,7 @@
 | Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
 | Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
 | Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
+| Phi-4-mini | 3.8B | LLM | Yes | Yes | No | - | - |

 ```{note}
 * [1] 目前，Mono-InternVL不支持FP16，因为数值不稳定。请改用BF16
````

lmdeploy/model.py (+3)

```diff
@@ -1633,6 +1633,7 @@ def match(cls, model_path: str) -> Optional[str]:
         return 'internvl-zh-hermes2'


+@MODELS.register_module(name='phi-4')
 @MODELS.register_module(name='phi-3')
 class Phi3Instruct(BaseChatTemplate):
     """Chat template of InternLM model."""
@@ -1669,6 +1670,8 @@ def match(cls, model_path: str) -> Optional[str]:
         path = model_path.lower()
         if all([c in path for c in ['phi-3', 'instruct']]):
             return 'phi-3'
+        if all([c in path for c in ['phi-4', 'instruct']]):
+            return 'phi-4'


 @MODELS.register_module(name='internvl2-phi3')
```
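
The `match` hook resolves a model path to a registered chat-template name via lowercase substring checks, so Phi-4 reuses the existing `Phi3Instruct` template under the new `phi-4` name. A standalone sketch of the added rule (illustrative, not the library code itself):

```python
# Standalone sketch of the matching rule added above: a lowercased model
# path containing both 'phi-4' and 'instruct' maps to the 'phi-4' template.
def match_phi(model_path: str):
    path = model_path.lower()
    if all(c in path for c in ['phi-3', 'instruct']):
        return 'phi-3'
    if all(c in path for c in ['phi-4', 'instruct']):
        return 'phi-4'
    return None

assert match_phi('microsoft/Phi-4-mini-instruct') == 'phi-4'
```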

lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py (+10 −1)

```diff
@@ -124,7 +124,16 @@ def apply_rotary_pos_emb(q: Tensor,

     seq_len = cos.numel() // cos.size(-1)
     BLOCK = 16
-    half_size = q.size(-1) // 2
+
+    if q.size(-1) == cos.size(-1):
+        half_size = q.size(-1) // 2
+    elif q.size(-1) > cos.size(-1):
+        # only do rope with rope_dim size
+        half_size = cos.size(-1) // 2
+    else:
+        raise ValueError('Not support head_dim < rope_dim, '
+                         f'but given head_dim={q.size(-1)} '
+                         f'rope_dim={cos.size(-1)}')
     BLOCK_N = triton.next_power_of_2(half_size)
     num_heads_q = q.size(-2)
     num_heads_k = k.size(-2)
```
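
This change lets the Triton path tolerate `head_dim > rope_dim`: only the leading `rope_dim` channels of each head are rotated, and the tail passes through untouched. A plain-PyTorch reference of that behavior, as a sketch under the standard half-rotation convention (not the Triton kernel itself):

```python
import torch


def partial_rope_ref(q: torch.Tensor, cos: torch.Tensor,
                     sin: torch.Tensor) -> torch.Tensor:
    """Rotate only the first rope_dim channels; pass the rest through."""
    rope_dim = cos.size(-1)          # may be < head_dim (partial rotary)
    q_rot, q_pass = q[..., :rope_dim], q[..., rope_dim:]
    half = rope_dim // 2
    x1, x2 = q_rot[..., :half], q_rot[..., half:]
    rotated = torch.cat((-x2, x1), dim=-1)
    return torch.cat((q_rot * cos + rotated * sin, q_pass), dim=-1)
```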

lmdeploy/pytorch/models/phi3.py (+8)

```diff
@@ -232,6 +232,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
         rope_max_pos_emb = config.max_position_embeddings
         rope_base = config.rope_theta
         rope_scaling = config.rope_scaling
+        partial_rotary_factor = getattr(config, 'partial_rotary_factor', None)
         if rope_scaling is not None:
             scaling_type = rope_scaling['type']
             assert scaling_type in ['longrope', 'su']
@@ -246,13 +247,15 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
                 rope_base,
                 longrope_params=longrope_params,
                 emb_type=emb_type,
+                partial_rotary_factor=partial_rotary_factor,
             )
         else:
             self.rotary_emb = build_rotary_embedding(
                 rope_dim,
                 rope_max_pos_emb,
                 rope_base,
                 emb_type=emb_type,
+                partial_rotary_factor=partial_rotary_factor,
             )

     def forward(
@@ -348,6 +351,11 @@ def get_logits(self, hidden_states: torch.Tensor):
         """compute logits of the model output."""
         return self.lm_head(hidden_states)

+    def update_weights(self):
+        """update weights."""
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
     def get_input_embeddings(self):
         """get input embeddings."""
         return self.model.get_input_embeddings()
```
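
The new `update_weights` ties the output projection to the input embedding when the config requests it, avoiding a second vocab-sized matrix. A standalone sketch of the tying (the sizes are illustrative only, not read from a real config):

```python
import torch.nn as nn

vocab_size, hidden_size = 200_064, 3072  # illustrative, Phi-4-mini-like sizes
embed_tokens = nn.Embedding(vocab_size, hidden_size)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

# Tie: lm_head now shares the same Parameter object, not a copy.
lm_head.weight = embed_tokens.weight
assert lm_head.weight.data_ptr() == embed_tokens.weight.data_ptr()
```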

lmdeploy/pytorch/nn/rotary_embedding.py (+6 −1)

```diff
@@ -85,11 +85,16 @@ def build_rotary_embedding(dim: int,
                            yarn_params: YarnParameters = None,
                            longrope_params: LongRoPEScalingParameters = None,
                            llama3_params: Llama3Parameters = None,
-                           emb_type: RopeType = RopeType.Default) -> nn.Module:
+                           emb_type: RopeType = RopeType.Default,
+                           partial_rotary_factor: float = None) -> nn.Module:
     """build rotary embedding op."""
     backend = get_backend()

     builder = backend.get_layer_impl_builder(OpType.RotaryEmbedding)
+
+    # update rope_dim
+    if partial_rotary_factor is not None:
+        dim = int(dim * partial_rotary_factor)
     return builder.build(dim,
                          max_position_embeddings,
                          base,
```
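
Here `partial_rotary_factor` simply shrinks the dimension handed to the backend builder, so every downstream op sees the reduced rotary dimension. A worked example of the arithmetic (the 128 and 0.75 are illustrative, Phi-4-mini-like assumptions, not values read from a config):

```python
# Worked example of the rope_dim update above.
head_dim, partial_rotary_factor = 128, 0.75  # assumed, illustrative values
rope_dim = int(head_dim * partial_rotary_factor)
assert rope_dim == 96  # only 96 of 128 channels get rotary position encoding
```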
