@@ -68,11 +68,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     cells.resize(kv_size);
 
-    if (supports_set_rows) {
-        // TODO: this requirement can be relaxed, but it would be much easier to implement when we have an actual
-        //       model that needs this
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14517
-        GGML_ASSERT(hparams.is_n_embd_v_gqa_homogeneous());
+    // [TAG_V_CACHE_VARIABLE]
+    if (v_trans && hparams.is_n_embd_v_gqa_variable()) {
+        LLAMA_LOG_WARN("%s: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to %d\n",
+                __func__, hparams.n_embd_v_gqa_max());
     }
 
     for (uint32_t il = 0; il < n_layer_cache; il++) {
@@ -81,8 +80,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
             continue;
         }
 
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+        // [TAG_V_CACHE_VARIABLE]
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+        const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
 
        const char * dev_name = "CPU";
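Context for the two constructor hunks above: when flash attention is off the V cache is stored transposed, so every layer's V buffer must share one row length; the patch picks the maximum n_embd_v_gqa across layers and pads narrower layers up to it. Below is a minimal standalone C++ sketch of that sizing rule (not part of the patch; the per-layer sizes are made-up values, not from any real model):

// sketch: transposed V caches are all allocated with the max row size
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<uint32_t> n_embd_v_gqa_per_layer = {1024, 1024, 512, 256}; // assumed sizes
    const bool v_trans = true; // no flash attention -> V stored transposed

    const uint32_t n_embd_v_gqa_max =
        *std::max_element(n_embd_v_gqa_per_layer.begin(), n_embd_v_gqa_per_layer.end());

    for (size_t il = 0; il < n_embd_v_gqa_per_layer.size(); ++il) {
        // mirrors the constructor change: the transposed cache uses the padded (max) size
        const uint32_t n_embd_v_gqa = !v_trans ? n_embd_v_gqa_per_layer[il] : n_embd_v_gqa_max;
        std::printf("layer %zu: V row size = %u\n", il, n_embd_v_gqa);
    }
    return 0;
}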
@@ -808,19 +808,19 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
         // note: v->nb[1] <= v->nb[2]
         return ggml_view_4d(ctx, v,
                 hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, 1,
-                ggml_row_size(v->type, hparams.n_embd_head_v),             // v->nb[1]
-                ggml_row_size(v->type, hparams.n_embd_v_gqa(il)),          // v->nb[2]
-                ggml_row_size(v->type, hparams.n_embd_v_gqa(il)*kv_size),  // v->nb[3]
-                ggml_row_size(v->type, hparams.n_embd_v_gqa(il)*kv_size)*0);
+                ggml_row_size(v->type, hparams.n_embd_head_v),  // v->nb[1]
+                ggml_row_size(v->type, v->ne[0]),               // v->nb[2]
+                ggml_row_size(v->type, v->ne[0]*kv_size),       // v->nb[3]
+                ggml_row_size(v->type, v->ne[0]*kv_size)*0);
     }
 
     // note: v->nb[1] > v->nb[2]
     return ggml_view_4d(ctx, v,
             n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, 1,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v),     // v->nb[1]
-            ggml_row_size(v->type, kv_size),                           // v->nb[2]
-            ggml_row_size(v->type, kv_size*hparams.n_embd_v_gqa(il)),  // v->nb[3]
-            ggml_row_size(v->type, kv_size*hparams.n_embd_v_gqa(il))*0);
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v),  // v->nb[1]
+            ggml_row_size(v->type, kv_size),                        // v->nb[2]
+            ggml_row_size(v->type, kv_size*v->ne[0]),               // v->nb[3]
+            ggml_row_size(v->type, kv_size*v->ne[0])*0);
 }
 
 ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
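The get_v hunk above switches the view strides from the per-layer hparams.n_embd_v_gqa(il) to the allocated row size v->ne[0], which already includes any padding. A rough standalone sketch of the transposed-view stride arithmetic follows (not part of the patch; it assumes an f32 cache, where ggml_row_size(type, n) reduces to n*sizeof(float), and uses made-up dimensions):

// sketch: byte strides for the transposed V view, derived from the padded row size
#include <cstddef>
#include <cstdio>

int main() {
    const size_t kv_size       = 4096; // cache length (assumed)
    const size_t n_embd_head_v = 128;  // head size (assumed)
    const size_t v_ne0         = 1024; // allocated (possibly padded) row size, i.e. v->ne[0]

    const size_t nb1 = kv_size * n_embd_head_v * sizeof(float); // step between KV heads
    const size_t nb2 = kv_size * sizeof(float);                 // step between embedding components
    const size_t nb3 = kv_size * v_ne0 * sizeof(float);         // step for the (size-1) 4th dimension

    std::printf("nb1=%zu nb2=%zu nb3=%zu\n", nb1, nb2, nb3);
    return 0;
}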
@@ -856,8 +856,8 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     auto * v = layers[ikv].v;
 
-    const int64_t n_embd_v_gqa = v->ne[0];
-    const int64_t n_tokens     = v_cur->ne[2];
+    const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
+    const int64_t n_tokens     = v_cur->ne[2];
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
@@ -870,6 +870,11 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
         return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
+
     // the row becomes a single element
     ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
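The two cpy_v hunks above take the current layer's true V width from v_cur itself and, when it is narrower than the allocated cache row v->ne[0], zero-pad it (via ggml_pad along dim 0) before scattering into the fixed-width transposed cache. A plain C++ sketch of the idea, with made-up sizes and no ggml dependency:

// sketch: zero-pad a narrow V row up to the allocated cache row width
#include <cstdio>
#include <vector>

int main() {
    const size_t n_embd_v_gqa = 6;  // this layer's actual V row size (assumed)
    const size_t row_padded   = 8;  // allocated cache row size, i.e. v->ne[0] (assumed)

    std::vector<float> v_cur(n_embd_v_gqa, 1.0f);   // one token's V values
    v_cur.resize(row_padded, 0.0f);                 // stands in for ggml_pad along dim 0

    for (float x : v_cur) std::printf("%.0f ", x);
    std::printf("\n"); // prints: 1 1 1 1 1 1 0 0
    return 0;
}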
@@ -916,7 +921,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, con
     if (!v_trans) {
         v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
     } else {
-        v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa());
+        v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa_max());
     }
 
     ggml_set_input(v_idxs);
@@ -957,7 +962,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
     // note: the V cache is transposed when not using flash attention
     const int64_t kv_size = get_size();
 
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa_max();
 
     for (uint32_t i = 0; i < n_tokens; ++i) {
        for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
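For the transposed cache these last two hunks size and fill one destination index per (token, embedding component) pair, now using the padded n_embd_v_gqa_max. The standalone sketch below is illustrative only, not the actual function body: it shows one plausible transposed addressing where component j of a token stored in cell c lands at row j, column c of the V buffer (cache length, row count, and cell assignments are assumed values):

// sketch: destination indices for scattering into a transposed V cache
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t kv_size      = 16; // cache length (assumed)
    const int64_t n_embd_v_gqa = 4;  // padded row count, i.e. n_embd_v_gqa_max() (assumed)
    const std::vector<int64_t> cells = {3, 4}; // cache cells assigned to two tokens (assumed)

    std::vector<int64_t> v_idxs(cells.size()*n_embd_v_gqa);
    for (size_t i = 0; i < cells.size(); ++i) {
        for (int64_t j = 0; j < n_embd_v_gqa; ++j) {
            v_idxs[i*n_embd_v_gqa + j] = j*kv_size + cells[i]; // row j, column cells[i]
        }
    }

    for (int64_t idx : v_idxs) std::printf("%lld ", (long long) idx);
    std::printf("\n");
    return 0;
}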