From f0ccbbcab4eec081959df7a3c428db417228d811 Mon Sep 17 00:00:00 2001
From: bstnfr <bastien-bouge@hotmail.fr>
Date: Fri, 24 Apr 2026 12:32:01 +0200
Subject: [PATCH 1/5] deps: bump mlx-lm to >=0.31.3 and mlx to >=0.31.2

mlx-lm 0.31.3 requires mlx>=0.31.2 on Darwin per its published
metadata. Bump the lower bounds to match what's actually needed
at runtime. Installed versions unchanged (mlx 0.31.2, mlx-lm 0.31.3).
Test suite: 43 passed, 1 skipped.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index fba85a3..b175dab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,8 @@ license = {text = "MIT"}
 requires-python = ">=3.10"
 authors = [{name = "bstnxbt"}]
 dependencies = [
-    "mlx>=0.25.0",
-    "mlx-lm>=0.31.0",
+    "mlx>=0.31.2",
+    "mlx-lm>=0.31.3",
 ]
 
 [project.urls]

From 2f9c2508205f71d8573d8a4be17e1b1a685118bf Mon Sep 17 00:00:00 2001
From: bstnfr <bastien-bouge@hotmail.fr>
Date: Fri, 24 Apr 2026 13:01:59 +0200
Subject: [PATCH 2/5] bench: timestamp result filenames to preserve history

Previously dflash-benchmark would overwrite benchmark/results/<chip>/<name>.json
on every run. Append UTC YYYYMMDDTHHMMSSZ to the basename so repeated runs
never lose data and deltas across branches/commits stay traceable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmark/benchmark.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 1e03b41..eb6aac4 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -112,6 +112,9 @@ def _default_results_path(
     if draft_quant:
         slug = re.sub(r"[^a-z0-9]+", "-", draft_quant.lower()).strip("-")
         name = f"{name}-dq-{slug}"
+    # Timestamp every run so repeated benches never overwrite history.
+    ts = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
+    name = f"{name}-{ts}"
     folder = _slugify_chip(chip) if chip else "unknown-chip"
     return Path("benchmark/results") / folder / f"{name}.json"
 

From a4650ced5a690a448e75392cb7ffed0c5913bd66 Mon Sep 17 00:00:00 2001
From: "clandestine.eth" <96172957+0xClandestine@users.noreply.github.com>
Date: Sat, 25 Apr 2026 17:11:18 -0400
Subject: [PATCH 3/5] feat: add modular architecture system supporting all
 DFlash models

- New archs/ directory with pluggable architecture system
- Supports Qwen3 (dense + MoE), Llama, Gemma architectures
- Handles both standard config and Gemma-style speculator config
- Updated DRAFT_REGISTRY with 16 models from z-lab and RedHatAI
- Backward compatibility maintained via model.py wrapper

Models now supported:
- z-lab: Qwen3.5-4B/9B/27B/35B-A3B/122B-A10B, Qwen3-4B/8B,
  Qwen3.6-27B/35B-A3B, Qwen3-Coder-Next/30B-A3B, Kimi-K2.5,
  Llama-3.1-8B-Instruct, GPT-OSS-20B/120B
- RedHatAI: Gemma-4-31B-it
---
 dflash_mlx/archs/__init__.py |  53 +++++
 dflash_mlx/archs/base.py     | 392 ++++++++++++++++++++++++++++++++
 dflash_mlx/archs/llama.py    | 428 +++++++++++++++++++++++++++++++++++
 dflash_mlx/archs/qwen3.py    | 412 +++++++++++++++++++++++++++++++++
 dflash_mlx/generate.py       |  30 ++-
 dflash_mlx/model.py          | 343 ++++++----------------------
 dflash_mlx/runtime.py        |  19 +-
 7 files changed, 1390 insertions(+), 287 deletions(-)
 create mode 100644 dflash_mlx/archs/__init__.py
 create mode 100644 dflash_mlx/archs/base.py
 create mode 100644 dflash_mlx/archs/llama.py
 create mode 100644 dflash_mlx/archs/qwen3.py

diff --git a/dflash_mlx/archs/__init__.py b/dflash_mlx/archs/__init__.py
new file mode 100644
index 0000000..0dfa0cb
--- /dev/null
+++ b/dflash_mlx/archs/__init__.py
@@ -0,0 +1,53 @@
+# Copyright 2026 bstnxbt
+# MIT License — see LICENSE file
+# Based on DFlash (arXiv:2602.06036)
+
+"""
+DFlash architecture modular system.
+
+This module provides a pluggable architecture system supporting multiple
+model architectures (Qwen3, Llama/Gemma, etc.) with custom attention,
+MLP, normalization, and RoPE implementations.
+"""
+
+from dflash_mlx.archs.base import (
+    DFlashAttention,
+    DFlashArgs,
+    DFlashCache,
+    DFlashDecoderLayer,
+    DFlashMLP,
+    DFlashModel,
+    DFlashNorm,
+    DFlashRope,
+    create_dflash_model,
+    extract_context_feature,
+    get_architecture_for_model_type,
+    list_supported_architectures,
+    register_architecture,
+)
+from dflash_mlx.archs.qwen3 import Qwen3DFlashModel, Qwen3DFlashAttention, Qwen3DFlashMLP
+from dflash_mlx.archs.llama import LlamaDFlashModel, LlamaDFlashAttention, LlamaDFlashMLP
+
+__all__ = [
+    # Base classes
+    "DFlashArgs",
+    "DFlashModel",
+    "DFlashAttention",
+    "DFlashMLP",
+    "DFlashNorm",
+    "DFlashRope",
+    "DFlashCache",
+    "DFlashDecoderLayer",
+    # Factory functions
+    "create_dflash_model",
+    "get_architecture_for_model_type",
+    "list_supported_architectures",
+    "register_architecture",
+    # Architecture implementations
+    "Qwen3DFlashModel",
+    "Qwen3DFlashAttention",
+    "Qwen3DFlashMLP",
+    "LlamaDFlashModel",
+    "LlamaDFlashAttention",
+    "LlamaDFlashMLP",
+]
\ No newline at end of file
diff --git a/dflash_mlx/archs/base.py b/dflash_mlx/archs/base.py
new file mode 100644
index 0000000..5a7aaaa
--- /dev/null
+++ b/dflash_mlx/archs/base.py
@@ -0,0 +1,392 @@
+# Copyright 2026 bstnxbt
+# MIT License — see LICENSE file
+# Based on DFlash (arXiv:2602.06036)
+
+"""
+Base protocols and abstractions for DFlash architecture system.
+
+This module defines the interfaces that each architecture must implement,
+enabling a pluggable system for supporting multiple model architectures.
+"""
+
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Callable, ClassVar, Optional, Protocol, Type, TypeVar, runtime_checkable
+
+import mlx.core as mx
+import mlx.nn as nn
+
+
+# =============================================================================
+# Protocol Definitions (interfaces for architecture implementations)
+# =============================================================================
+
+
+@runtime_checkable
+class DFlashNorm(Protocol):
+    """Normalization layer protocol."""
+
+    def __call__(self, x: mx.array) -> mx.array:
+        """Apply normalization."""
+        ...
+
+
+@runtime_checkable
+class DFlashRope(Protocol):
+    """Rotary Positional Embedding protocol."""
+
+    def __call__(
+        self,
+        x: mx.array,
+        *,
+        offset: int = 0,
+    ) -> mx.array:
+        """Apply RoPE with given offset."""
+        ...
+
+
+@runtime_checkable
+class DFlashAttention(Protocol):
+    """Attention layer protocol for DFlash cross-attention."""
+
+    n_heads: int
+    n_kv_heads: int
+    head_dim: int
+    scale: float
+
+    def __init__(
+        self,
+        args: DFlashArgs,
+    ) -> None:
+        """Initialize attention with model arguments."""
+        ...
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        """Forward pass with cross-attention to target hidden states."""
+        ...
+
+
+@runtime_checkable
+class DFlashMLP(Protocol):
+    """MLP/feed-forward network protocol."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+    ) -> None:
+        """Initialize MLP."""
+        ...
+
+    def __call__(self, x: mx.array) -> mx.array:
+        """Forward pass."""
+        ...
+
+
+@runtime_checkable
+class DFlashCache(Protocol):
+    """KV cache protocol for attention layers."""
+
+    offset: int
+
+    def append_context(
+        self,
+        context_keys: mx.array,
+        context_values: mx.array,
+        num_positions: int,
+    ) -> None:
+        """Append context KV to cache."""
+        ...
+
+    def fetch(self) -> tuple[Optional[mx.array], Optional[mx.array]]:
+        """Fetch cached keys and values."""
+        ...
+
+    def update_and_fetch(
+        self,
+        keys: mx.array,
+        values: mx.array,
+    ) -> tuple[mx.array, mx.array]:
+        """Update cache with new keys/values and fetch."""
+        ...
+
+    def cache_length(self) -> int:
+        """Get current cache length."""
+        ...
+
+
+@runtime_checkable
+class DFlashDecoderLayer(Protocol):
+    """Single decoder layer protocol."""
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        """Forward pass through decoder layer."""
+        ...
+
+
+@runtime_checkable
+class DFlashModel(Protocol):
+    """Full DFlash draft model protocol."""
+
+    model_type: str
+    target_layer_ids: list[int]
+    block_size: int
+    mask_token_id: int
+    args: DFlashArgs
+
+    def __call__(
+        self,
+        *,
+        noise_embedding: mx.array,
+        target_hidden: mx.array,
+        cache: Optional[list[Optional[DFlashCache]]] = None,
+    ) -> mx.array:
+        """Forward pass through the full model."""
+        ...
+
+    def sanitize(self, weights: dict[str, mx.array]) -> dict[str, mx.array]:
+        """Sanitize model weights after loading."""
+        ...
+
+
+# =============================================================================
+# Model Arguments Dataclass
+# =============================================================================
+
+
+@dataclass
+class DFlashArgs:
+    """Configuration arguments for DFlash draft models."""
+
+    model_type: str
+    hidden_size: int
+    num_hidden_layers: int
+    intermediate_size: int
+    num_attention_heads: int
+    rms_norm_eps: float
+    vocab_size: int
+    num_key_value_heads: int
+    max_position_embeddings: int
+    rope_theta: float
+    head_dim: int
+    tie_word_embeddings: bool
+    num_target_layers: int
+    block_size: int
+    attention_bias: bool = False
+    attention_dropout: float = 0.0
+    rope_scaling: Optional[dict[str, Any]] = None
+    layer_types: tuple[str, ...] = ()
+    dflash_config: dict[str, Any] = field(default_factory=dict)
+
+    # Architecture-specific attributes (set by architecture implementation)
+    architecture: Optional[str] = None
+
+    @classmethod
+    def from_dict(cls, params: dict[str, Any]) -> "DFlashArgs":
+        """Create args from config dictionary.
+        
+        Handles both standard DFlash config and Gemma-style speculator config
+        (which uses transformer_layer_config to embed the actual model config).
+        """
+        data = dict(params)
+        data["layer_types"] = tuple(data.get("layer_types") or ())
+        data["dflash_config"] = dict(data.get("dflash_config") or {})
+
+        # Handle Gemma-style config with embedded transformer_layer_config
+        transformer_config = data.get("transformer_layer_config", {})
+        if transformer_config:
+            # This is a Gemma-style speculator config
+            # Extract model params from the embedded transformer config
+            for key in [
+                "hidden_size", "num_hidden_layers", "intermediate_size",
+                "num_attention_heads", "rms_norm_eps", "vocab_size",
+                "num_key_value_heads", "max_position_embeddings",
+                "rope_theta", "head_dim", "tie_word_embeddings",
+                "attention_bias", "attention_dropout",
+            ]:
+                if key in transformer_config and key not in data:
+                    data[key] = transformer_config[key]
+            
+            # Get rope parameters
+            rope_params = transformer_config.get("rope_parameters", {})
+            if rope_params and "rope_theta" not in data:
+                data["rope_theta"] = rope_params.get("rope_theta", 1e6)
+            
+            # Set model type from transformer config
+            if "model_type" not in data:
+                data["model_type"] = transformer_config.get("model_type", "llama")
+            
+            # Extract dflash_config from top-level if not present
+            if "dflash_config" not in data or not data["dflash_config"]:
+                data["dflash_config"] = {
+                    "target_layer_ids": data.get("aux_hidden_state_layer_ids"),
+                    "mask_token_id": data.get("mask_token_id"),
+                }
+            
+            # Set num_target_layers
+            if "num_target_layers" not in data:
+                data["num_target_layers"] = transformer_config.get("num_hidden_layers", 62)
+
+        # Determine architecture from model_type or config
+        model_type = data.get("model_type", "")
+        arch = _infer_architecture(model_type, data)
+        data["architecture"] = arch
+
+        return cls(
+            **{key: value for key, value in data.items() if key in cls.__annotations__}
+        )
+
+
+def _infer_architecture(model_type: str, config: dict[str, Any]) -> str:
+    """Infer the architecture name from model type and config."""
+    model_type_lower = model_type.lower()
+
+    # Check for Llama-based models (Gemma, Llama, etc.)
+    if "llama" in model_type_lower or "gemma" in model_type_lower:
+        return "llama"
+
+    # Check for Qwen models
+    if "qwen" in model_type_lower:
+        return "qwen3"
+
+    # Check transformer_layer_config for Llama (Gemma spec format)
+    transformer_config = config.get("transformer_layer_config", {})
+    if transformer_config:
+        inner_type = transformer_config.get("model_type", "").lower()
+        if "llama" in inner_type or "gemma" in inner_type:
+            return "llama"
+
+    # Default to qwen3 for backward compatibility
+    return "qwen3"
+
+
+# =============================================================================
+# Architecture Registry
+# =============================================================================
+
+
+@dataclass
+class ArchitectureSpec:
+    """Specification for a DFlash architecture implementation."""
+
+    name: str
+    model_class: Type[DFlashModel]
+    attention_class: Type[DFlashAttention]
+    mlp_class: Type[DFlashMLP]
+    norm_class: Optional[Type[DFlashNorm]] = None
+    rope_class: Optional[Type[DFlashRope]] = None
+    cache_class: Optional[Type[DFlashCache]] = None
+    # Patterns that identify this architecture in model type strings
+    model_type_patterns: tuple[str, ...] = ()
+
+
+class ArchitectureRegistry:
+    """Registry for DFlash architecture implementations."""
+
+    _architectures: ClassVar[dict[str, ArchitectureSpec]] = {}
+    _fallback: ClassVar[Optional[Type[DFlashModel]]] = None
+
+    @classmethod
+    def register(cls, spec: ArchitectureSpec) -> None:
+        """Register an architecture implementation."""
+        cls._architectures[spec.name] = spec
+        for pattern in spec.model_type_patterns:
+            cls._architectures[pattern] = spec
+
+    @classmethod
+    def get(cls, name: str) -> Optional[ArchitectureSpec]:
+        """Get architecture by name."""
+        return cls._architectures.get(name)
+
+    @classmethod
+    def get_for_model_type(cls, model_type: str) -> ArchitectureSpec:
+        """Get the appropriate architecture for a model type."""
+        model_type_lower = model_type.lower()
+
+        # Direct match
+        if model_type_lower in cls._architectures:
+            return cls._architectures[model_type_lower]
+
+        # Pattern matching
+        for name, spec in cls._architectures.items():
+            if name in model_type_lower:
+                return spec
+
+        # Fallback to qwen3
+        if "qwen3" in cls._architectures:
+            return cls._architectures["qwen3"]
+
+        raise ValueError(f"No architecture found for model type: {model_type}")
+
+
+def register_architecture(spec: ArchitectureSpec) -> None:
+    """Register a DFlash architecture implementation."""
+    ArchitectureRegistry.register(spec)
+
+
+def get_architecture_for_model_type(model_type: str) -> ArchitectureSpec:
+    """Get the appropriate architecture for a model type."""
+    return ArchitectureRegistry.get_for_model_type(model_type)
+
+
+def list_supported_architectures() -> list[str]:
+    """List all supported architecture names."""
+    return list(set(ArchitectureRegistry._architectures.keys()))
+
+
+# =============================================================================
+# Model Factory
+# =============================================================================
+
+
+def create_dflash_model(config: dict[str, Any]) -> DFlashModel:
+    """
+    Create a DFlash model from configuration.
+
+    Args:
+        config: Model configuration dictionary (from config.json)
+
+    Returns:
+        Instance of the appropriate DFlash model for the architecture
+    """
+    args = DFlashArgs.from_dict(config)
+    arch_spec = get_architecture_for_model_type(args.model_type)
+    return arch_spec.model_class(args)
+
+
+# Backward compatibility - export the original class names
+def build_target_layer_ids(num_target_layers: int, num_draft_layers: int) -> list[int]:
+    """Build default target layer IDs for draft model."""
+    if num_draft_layers <= 1:
+        return [num_target_layers // 2]
+    start = 1
+    end = num_target_layers - 3
+    span = end - start
+    return [
+        int(round(start + (index * span) / (num_draft_layers - 1)))
+        for index in range(num_draft_layers)
+    ]
+
+
+def extract_context_feature(
+    hidden_states: list[mx.array],
+    layer_ids: list[int],
+) -> mx.array:
+    """Extract and concatenate hidden states at specified layer IDs."""
+    selected = [hidden_states[layer_id + 1] for layer_id in layer_ids]
+    return mx.concatenate(selected, axis=-1)
\ No newline at end of file
diff --git a/dflash_mlx/archs/llama.py b/dflash_mlx/archs/llama.py
new file mode 100644
index 0000000..a0c1b14
--- /dev/null
+++ b/dflash_mlx/archs/llama.py
@@ -0,0 +1,428 @@
+# Copyright 2026 bstnxbt
+# MIT License — see LICENSE file
+# Based on DFlash (arXiv:2602.06036)
+
+"""
+Llama/Gemma DFlash architecture implementation.
+
+This module provides the DFlash draft model implementation for Llama-based models
+(including Gemma, Llama, and other variants that use Llama-style architecture).
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import scaled_dot_product_attention
+from mlx_lm.models.llama import MLP as LlamaMLP
+from mlx_lm.models.rope_utils import initialize_rope
+
+from dflash_mlx.archs.base import (
+    DFlashArgs,
+    DFlashAttention,
+    DFlashCache,
+    DFlashDecoderLayer,
+    DFlashModel,
+    DFlashMLP,
+    DFlashNorm,
+    DFlashRope,
+    ArchitectureSpec,
+    build_target_layer_ids,
+    register_architecture,
+)
+
+
+# =============================================================================
+# Llama-specific Norm (standard RMSNorm)
+# =============================================================================
+
+
+class LlamaDFlashNorm(nn.RMSNorm):
+    """Llama-style RMSNorm implementation."""
+
+    pass
+
+
+# =============================================================================
+# Llama RoPE (slightly different from Qwen3)
+# =============================================================================
+
+
+class LlamaDFlashRope:
+    """Llama-style RoPE implementation."""
+
+    def __init__(
+        self,
+        head_dim: int,
+        base: float,
+        max_position_embeddings: int,
+        scaling_config: Optional[dict[str, Any]] = None,
+    ):
+        # Llama uses traditional RoPE (not the "next" variant)
+        self.rope = initialize_rope(
+            head_dim,
+            base=base,
+            traditional=True,  # Different from Qwen3
+            scaling_config=scaling_config,
+            max_position_embeddings=max_position_embeddings,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        *,
+        offset: int = 0,
+    ) -> mx.array:
+        return self.rope(x, offset=offset)
+
+
+# =============================================================================
+# Llama Attention (no Q/K normalization - key difference from Qwen3)
+# =============================================================================
+
+
+class LlamaDFlashAttention(nn.Module, DFlashAttention):
+    """
+    Llama-style DFlash cross-attention layer.
+
+    Unlike Qwen3, Llama models do NOT use Q/K normalization.
+    This is the key architectural difference.
+    """
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        dim = args.hidden_size
+        self.n_heads = args.num_attention_heads
+        self.n_kv_heads = args.num_key_value_heads
+        self.head_dim = args.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(dim, self.n_heads * self.head_dim, bias=args.attention_bias)
+        self.k_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
+        self.v_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
+        self.o_proj = nn.Linear(self.n_heads * self.head_dim, dim, bias=args.attention_bias)
+
+        # Note: No Q/K normalization for Llama - key difference from Qwen3
+
+        self.rope = LlamaDFlashRope(
+            self.head_dim,
+            base=args.rope_theta,
+            max_position_embeddings=args.max_position_embeddings,
+            scaling_config=args.rope_scaling,
+        )
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        batch, block_len, _ = hidden_states.shape
+        ctx_len = int(target_hidden.shape[1])
+
+        # Project queries (no Q norm for Llama)
+        queries = self.q_proj(hidden_states).reshape(
+            batch, block_len, self.n_heads, -1
+        ).transpose(0, 2, 1, 3)
+
+        # Project keys/values from concatenated target + noise
+        kv_states = mx.concatenate([target_hidden, hidden_states], axis=1)
+        all_keys = self.k_proj(kv_states).reshape(
+            batch, ctx_len + block_len, self.n_kv_heads, -1
+        ).transpose(0, 2, 1, 3)
+        all_values = self.v_proj(kv_states).reshape(
+            batch, ctx_len + block_len, self.n_kv_heads, -1
+        ).transpose(0, 2, 1, 3)
+
+        context_keys = all_keys[:, :, :ctx_len, :]
+        context_values = all_values[:, :, :ctx_len, :]
+        noise_keys = all_keys[:, :, ctx_len:, :]
+        noise_values = all_values[:, :, ctx_len:, :]
+
+        if cache is not None:
+            if isinstance(cache, LlamaContextOnlyCache):
+                cache_offset = int(cache.offset)
+                query_offset = cache_offset + ctx_len
+
+                queries = self.rope(queries, offset=query_offset)
+                context_keys = self.rope(context_keys, offset=cache_offset)
+                noise_keys = self.rope(noise_keys, offset=query_offset)
+
+                cache.append_context(context_keys, context_values, ctx_len)
+                cached_keys, cached_values = cache.fetch()
+                keys = mx.concatenate([cached_keys, noise_keys], axis=-2)
+                values = mx.concatenate([cached_values, noise_values], axis=-2)
+
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=None,
+                    scale=self.scale,
+                    mask=None,
+                )
+            else:
+                cache_offset = int(getattr(cache, "offset", 0) or 0)
+                query_offset = cache_offset + ctx_len
+
+                queries = self.rope(queries, offset=query_offset)
+                context_keys = self.rope(context_keys, offset=cache_offset)
+                noise_keys = self.rope(noise_keys, offset=query_offset)
+
+                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
+                values = mx.concatenate([context_values, noise_values], axis=-2)
+                keys, values = cache.update_and_fetch(keys, values)
+
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=cache,
+                    scale=self.scale,
+                    mask=None,
+                )
+        else:
+            queries = self.rope(queries, offset=ctx_len)
+            context_keys = self.rope(context_keys, offset=0)
+            noise_keys = self.rope(noise_keys, offset=ctx_len)
+
+            # Try to use optimized DFlash kernel if available
+            if hasattr(mx.fast, "dflash_cross_attention"):
+                output = mx.fast.dflash_cross_attention(
+                    queries,
+                    context_keys,
+                    context_values,
+                    noise_keys,
+                    noise_values,
+                    scale=self.scale,
+                )
+            else:
+                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
+                values = mx.concatenate([context_values, noise_values], axis=-2)
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=None,
+                    scale=self.scale,
+                    mask=None,
+                )
+
+        output = output.transpose(0, 2, 1, 3).reshape(batch, block_len, -1)
+        return self.o_proj(output)
+
+
+class LlamaContextOnlyCache:
+    """Llama-specific context-only KV cache with sliding window."""
+
+    def __init__(self, sink_size: int = 64, window_size: int = 1024):
+        self.sink_size = int(sink_size)
+        self.window_size = int(window_size)
+        self.keys: Optional[mx.array] = None
+        self.values: Optional[mx.array] = None
+        self.offset: int = 0
+
+    def append_context(
+        self,
+        context_keys: mx.array,
+        context_values: mx.array,
+        num_positions: int,
+    ) -> None:
+        if context_keys is None or context_values is None or int(num_positions) <= 0:
+            return
+
+        if self.keys is None:
+            self.keys = context_keys
+            self.values = context_values
+        else:
+            self.keys = mx.concatenate([self.keys, context_keys], axis=2)
+            self.values = mx.concatenate([self.values, context_values], axis=2)
+
+        self.offset += int(num_positions)
+        self._apply_window()
+
+    def _apply_window(self) -> None:
+        if self.keys is None or self.values is None:
+            return
+
+        cache_len = int(self.keys.shape[2])
+        max_len = self.sink_size + self.window_size
+
+        if cache_len <= max_len:
+            return
+
+        sink_k = self.keys[:, :, : self.sink_size, :]
+        sink_v = self.values[:, :, : self.sink_size, :]
+        window_k = self.keys[:, :, -self.window_size :, :]
+        window_v = self.values[:, :, -self.window_size :, :]
+
+        self.keys = mx.concatenate([sink_k, window_k], axis=2)
+        self.values = mx.concatenate([sink_v, window_v], axis=2)
+
+    def fetch(self) -> tuple[Optional[mx.array], Optional[mx.array]]:
+        return self.keys, self.values
+
+    def cache_length(self) -> int:
+        if self.keys is None:
+            return 0
+        return int(self.keys.shape[2])
+
+
+# =============================================================================
+# Llama MLP (SwiGLU activation)
+# =============================================================================
+
+
+class LlamaDFlashMLP(nn.Module, DFlashMLP):
+    """
+    Llama-style MLP with SwiGLU activation.
+
+    Uses gated linear unit with SiLU activation:
+    output = down_proj(silu(gate_proj(x)) * up_proj(x))
+    """
+
+    def __init__(self, hidden_size: int, intermediate_size: int):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+
+    def __call__(self, x: mx.array) -> mx.array:
+        return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+
+
+# =============================================================================
+# Llama Decoder Layer
+# =============================================================================
+
+
+class LlamaDFlashDecoderLayer(nn.Module, DFlashDecoderLayer):
+    """Llama-style decoder layer."""
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        self.self_attn = LlamaDFlashAttention(args)
+        self.mlp = LlamaDFlashMLP(args.hidden_size, args.intermediate_size)
+        self.input_layernorm = LlamaDFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = LlamaDFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states,
+            target_hidden=target_hidden,
+            cache=cache,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        return residual + hidden_states
+
+
+# =============================================================================
+# Llama Full Model
+# =============================================================================
+
+
+class LlamaDFlashModel(nn.Module, DFlashModel):
+    """
+    Llama-based DFlash draft model.
+
+    This model takes noise token embeddings (from the target model's embed_tokens)
+    and target hidden states, and produces draft logits for block-diffusion
+    speculative decoding.
+
+    Designed for Llama, Gemma, and other Llama-architecture models.
+    """
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = "dflash_llama"
+
+        # Create decoder layers
+        self.layers = [
+            LlamaDFlashDecoderLayer(args) for _ in range(args.num_hidden_layers)
+        ]
+
+        # Get target layer IDs from config or build defaults
+        target_layer_ids = list(args.dflash_config.get("target_layer_ids") or [])
+        self.target_layer_ids = target_layer_ids or build_target_layer_ids(
+            args.num_target_layers,
+            args.num_hidden_layers,
+        )
+
+        # Output projection
+        self.norm = LlamaDFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+        # Project concatenated target hidden states
+        self.fc = nn.Linear(
+            len(self.target_layer_ids) * args.hidden_size,
+            args.hidden_size,
+            bias=False,
+        )
+        self.hidden_norm = LlamaDFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+        self.block_size = int(args.block_size)
+        self.mask_token_id = int(args.dflash_config.get("mask_token_id", 0) or 0)
+
+    def _project_target_hidden(self, target_hidden: mx.array) -> mx.array:
+        """Project and normalize target hidden states."""
+        return self.hidden_norm(self.fc(target_hidden))
+
+    def __call__(
+        self,
+        *,
+        noise_embedding: mx.array,
+        target_hidden: mx.array,
+        cache: Optional[list[Optional[DFlashCache]]] = None,
+    ) -> mx.array:
+        hidden_states = noise_embedding
+        projected_hidden = self._project_target_hidden(target_hidden)
+
+        if cache is None:
+            cache = [None] * len(self.layers)
+
+        for layer, layer_cache in zip(self.layers, cache, strict=True):
+            hidden_states = layer(
+                hidden_states,
+                target_hidden=projected_hidden,
+                cache=layer_cache,
+            )
+
+        return self.norm(hidden_states)
+
+    def sanitize(self, weights: dict[str, mx.array]) -> dict[str, mx.array]:
+        """Sanitize model weights after loading."""
+        return weights
+
+
+# =============================================================================
+# Register Llama Architecture
+# =============================================================================
+
+
+llama_spec = ArchitectureSpec(
+    name="llama",
+    model_class=LlamaDFlashModel,
+    attention_class=LlamaDFlashAttention,
+    mlp_class=LlamaDFlashMLP,
+    norm_class=LlamaDFlashNorm,
+    rope_class=LlamaDFlashRope,
+    cache_class=LlamaContextOnlyCache,
+    model_type_patterns=("llama", "gemma", "mistral", "qwen1", "olmo", "gemma4"),
+)
+
+register_architecture(llama_spec)
\ No newline at end of file
diff --git a/dflash_mlx/archs/qwen3.py b/dflash_mlx/archs/qwen3.py
new file mode 100644
index 0000000..0859afc
--- /dev/null
+++ b/dflash_mlx/archs/qwen3.py
@@ -0,0 +1,412 @@
+# Copyright 2026 bstnxbt
+# MIT License — see LICENSE file
+# Based on DFlash (arXiv:2602.06036)
+
+"""
+Qwen3 DFlash architecture implementation.
+
+This module provides the DFlash draft model implementation for Qwen3-based models.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import scaled_dot_product_attention
+from mlx_lm.models.qwen3 import MLP as Qwen3MLP
+from mlx_lm.models.rope_utils import initialize_rope
+
+from dflash_mlx.archs.base import (
+    DFlashArgs,
+    DFlashAttention,
+    DFlashCache,
+    DFlashDecoderLayer,
+    DFlashModel,
+    DFlashMLP,
+    DFlashNorm,
+    DFlashRope,
+    ArchitectureSpec,
+    build_target_layer_ids,
+    register_architecture,
+)
+
+
+# =============================================================================
+# Qwen3-specific Norm (RMSNorm with optional Qwen3 specifics)
+# =============================================================================
+
+
+class Qwen3DFlashNorm(nn.RMSNorm):
+    """Qwen3-specific RMSNorm implementation."""
+
+    pass
+
+
+class Qwen3DFlashRope:
+    """Qwen3-specific RoPE implementation."""
+
+    def __init__(
+        self,
+        head_dim: int,
+        base: float,
+        max_position_embeddings: int,
+        scaling_config: Optional[dict[str, Any]] = None,
+    ):
+        self.rope = initialize_rope(
+            head_dim,
+            base=base,
+            traditional=False,
+            scaling_config=scaling_config,
+            max_position_embeddings=max_position_embeddings,
+        )
+
+    def __call__(
+        self,
+        x: mx.array,
+        *,
+        offset: int = 0,
+    ) -> mx.array:
+        return self.rope(x, offset=offset)
+
+
+# =============================================================================
+# Qwen3 Attention (with Q/K normalization)
+# =============================================================================
+
+
+class Qwen3DFlashAttention(nn.Module, DFlashAttention):
+    """
+    Qwen3-specific DFlash cross-attention layer.
+
+    This attention implementation includes Q/K normalization which is
+    specific to Qwen3 architecture.
+    """
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        dim = args.hidden_size
+        self.n_heads = args.num_attention_heads
+        self.n_kv_heads = args.num_key_value_heads
+        self.head_dim = args.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.q_proj = nn.Linear(dim, self.n_heads * self.head_dim, bias=args.attention_bias)
+        self.k_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
+        self.v_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
+        self.o_proj = nn.Linear(self.n_heads * self.head_dim, dim, bias=args.attention_bias)
+
+        # Qwen3-specific Q/K normalization
+        self.q_norm = nn.RMSNorm(self.head_dim, eps=args.rms_norm_eps)
+        self.k_norm = nn.RMSNorm(self.head_dim, eps=args.rms_norm_eps)
+
+        self.rope = Qwen3DFlashRope(
+            self.head_dim,
+            base=args.rope_theta,
+            max_position_embeddings=args.max_position_embeddings,
+            scaling_config=args.rope_scaling,
+        )
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        batch, block_len, _ = hidden_states.shape
+        ctx_len = int(target_hidden.shape[1])
+
+        # Project and reshape queries
+        queries = self.q_proj(hidden_states)
+        queries = self.q_norm(
+            queries.reshape(batch, block_len, self.n_heads, -1)
+        ).transpose(0, 2, 1, 3)
+
+        # Fuse context and noise projections: 2 matmuls instead of 4
+        kv_states = mx.concatenate([target_hidden, hidden_states], axis=1)
+        all_keys = self.k_norm(
+            self.k_proj(kv_states).reshape(batch, ctx_len + block_len, self.n_kv_heads, -1)
+        ).transpose(0, 2, 1, 3)
+        all_values = self.v_proj(kv_states).reshape(
+            batch, ctx_len + block_len, self.n_kv_heads, -1
+        ).transpose(0, 2, 1, 3)
+
+        context_keys = all_keys[:, :, :ctx_len, :]
+        context_values = all_values[:, :, :ctx_len, :]
+        noise_keys = all_keys[:, :, ctx_len:, :]
+        noise_values = all_values[:, :, ctx_len:, :]
+
+        if cache is not None:
+            if isinstance(cache, Qwen3ContextOnlyCache):
+                cache_offset = int(cache.offset)
+                query_offset = cache_offset + ctx_len
+
+                queries = self.rope(queries, offset=query_offset)
+                context_keys = self.rope(context_keys, offset=cache_offset)
+                noise_keys = self.rope(noise_keys, offset=query_offset)
+
+                cache.append_context(context_keys, context_values, ctx_len)
+                cached_keys, cached_values = cache.fetch()
+                keys = mx.concatenate([cached_keys, noise_keys], axis=-2)
+                values = mx.concatenate([cached_values, noise_values], axis=-2)
+
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=None,
+                    scale=self.scale,
+                    mask=None,
+                )
+            else:
+                # Use standard cache update
+                cache_offset = int(getattr(cache, "offset", 0) or 0)
+                query_offset = cache_offset + ctx_len
+
+                queries = self.rope(queries, offset=query_offset)
+                context_keys = self.rope(context_keys, offset=cache_offset)
+                noise_keys = self.rope(noise_keys, offset=query_offset)
+
+                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
+                values = mx.concatenate([context_values, noise_values], axis=-2)
+                keys, values = cache.update_and_fetch(keys, values)
+
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=cache,
+                    scale=self.scale,
+                    mask=None,
+                )
+        else:
+            # No cache - use standard attention path
+            queries = self.rope(queries, offset=ctx_len)
+            context_keys = self.rope(context_keys, offset=0)
+            noise_keys = self.rope(noise_keys, offset=ctx_len)
+
+            # Try to use optimized DFlash kernel if available
+            if hasattr(mx.fast, "dflash_cross_attention"):
+                output = mx.fast.dflash_cross_attention(
+                    queries,
+                    context_keys,
+                    context_values,
+                    noise_keys,
+                    noise_values,
+                    scale=self.scale,
+                )
+            else:
+                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
+                values = mx.concatenate([context_values, noise_values], axis=-2)
+                output = scaled_dot_product_attention(
+                    queries,
+                    keys,
+                    values,
+                    cache=None,
+                    scale=self.scale,
+                    mask=None,
+                )
+
+        output = output.transpose(0, 2, 1, 3).reshape(batch, block_len, -1)
+        return self.o_proj(output)
+
+
+class Qwen3ContextOnlyCache:
+    """Qwen3-specific context-only KV cache with sliding window."""
+
+    def __init__(self, sink_size: int = 64, window_size: int = 1024):
+        self.sink_size = int(sink_size)
+        self.window_size = int(window_size)
+        self.keys: Optional[mx.array] = None
+        self.values: Optional[mx.array] = None
+        self.offset: int = 0
+
+    def append_context(
+        self,
+        context_keys: mx.array,
+        context_values: mx.array,
+        num_positions: int,
+    ) -> None:
+        if context_keys is None or context_values is None or int(num_positions) <= 0:
+            return
+
+        if self.keys is None:
+            self.keys = context_keys
+            self.values = context_values
+        else:
+            self.keys = mx.concatenate([self.keys, context_keys], axis=2)
+            self.values = mx.concatenate([self.values, context_values], axis=2)
+
+        self.offset += int(num_positions)
+        self._apply_window()
+
+    def _apply_window(self) -> None:
+        if self.keys is None or self.values is None:
+            return
+
+        cache_len = int(self.keys.shape[2])
+        max_len = self.sink_size + self.window_size
+
+        if cache_len <= max_len:
+            return
+
+        sink_k = self.keys[:, :, : self.sink_size, :]
+        sink_v = self.values[:, :, : self.sink_size, :]
+        window_k = self.keys[:, :, -self.window_size :, :]
+        window_v = self.values[:, :, -self.window_size :, :]
+
+        self.keys = mx.concatenate([sink_k, window_k], axis=2)
+        self.values = mx.concatenate([sink_v, window_v], axis=2)
+
+    def fetch(self) -> tuple[Optional[mx.array], Optional[mx.array]]:
+        return self.keys, self.values
+
+    def cache_length(self) -> int:
+        if self.keys is None:
+            return 0
+        return int(self.keys.shape[2])
+
+
+# =============================================================================
+# Qwen3 MLP (using mlx_lm's Qwen3 MLP)
+# =============================================================================
+
+
+class Qwen3DFlashMLP(Qwen3MLP, DFlashMLP):
+    """Qwen3-specific MLP using the mlx_lm Qwen3 MLP implementation."""
+
+    pass
+
+
+# =============================================================================
+# Qwen3 Decoder Layer
+# =============================================================================
+
+
+class Qwen3DFlashDecoderLayer(nn.Module, DFlashDecoderLayer):
+    """Qwen3-specific decoder layer."""
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        self.self_attn = Qwen3DFlashAttention(args)
+        self.mlp = Qwen3DFlashMLP(args.hidden_size, args.intermediate_size)
+        self.input_layernorm = Qwen3DFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3DFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+    def __call__(
+        self,
+        hidden_states: mx.array,
+        *,
+        target_hidden: mx.array,
+        cache: Optional[DFlashCache] = None,
+    ) -> mx.array:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states,
+            target_hidden=target_hidden,
+            cache=cache,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        return residual + hidden_states
+
+
+# =============================================================================
+# Qwen3 Full Model
+# =============================================================================
+
+
+class Qwen3DFlashModel(nn.Module, DFlashModel):
+    """
+    Qwen3-based DFlash draft model.
+
+    This model takes noise token embeddings (from the target model's embed_tokens)
+    and target hidden states, and produces draft logits for block-diffusion
+    speculative decoding.
+    """
+
+    def __init__(self, args: DFlashArgs):
+        super().__init__()
+        self.args = args
+        self.model_type = "dflash_qwen3"
+
+        # Create decoder layers
+        self.layers = [
+            Qwen3DFlashDecoderLayer(args) for _ in range(args.num_hidden_layers)
+        ]
+
+        # Get target layer IDs from config or build defaults
+        target_layer_ids = list(args.dflash_config.get("target_layer_ids") or [])
+        self.target_layer_ids = target_layer_ids or build_target_layer_ids(
+            args.num_target_layers,
+            args.num_hidden_layers,
+        )
+
+        # Output projection
+        self.norm = Qwen3DFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+        # Project concatenated target hidden states
+        self.fc = nn.Linear(
+            len(self.target_layer_ids) * args.hidden_size,
+            args.hidden_size,
+            bias=False,
+        )
+        self.hidden_norm = Qwen3DFlashNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+        self.block_size = int(args.block_size)
+        self.mask_token_id = int(args.dflash_config.get("mask_token_id", 0) or 0)
+
+    def _project_target_hidden(self, target_hidden: mx.array) -> mx.array:
+        """Project and normalize target hidden states."""
+        return self.hidden_norm(self.fc(target_hidden))
+
+    def __call__(
+        self,
+        *,
+        noise_embedding: mx.array,
+        target_hidden: mx.array,
+        cache: Optional[list[Optional[DFlashCache]]] = None,
+    ) -> mx.array:
+        hidden_states = noise_embedding
+        projected_hidden = self._project_target_hidden(target_hidden)
+
+        if cache is None:
+            cache = [None] * len(self.layers)
+
+        for layer, layer_cache in zip(self.layers, cache, strict=True):
+            hidden_states = layer(
+                hidden_states,
+                target_hidden=projected_hidden,
+                cache=layer_cache,
+            )
+
+        return self.norm(hidden_states)
+
+    def sanitize(self, weights: dict[str, mx.array]) -> dict[str, mx.array]:
+        """Sanitize model weights after loading."""
+        return weights
+
+
+# =============================================================================
+# Register Qwen3 Architecture
+# =============================================================================
+
+
+qwen3_spec = ArchitectureSpec(
+    name="qwen3",
+    model_class=Qwen3DFlashModel,
+    attention_class=Qwen3DFlashAttention,
+    mlp_class=Qwen3DFlashMLP,
+    norm_class=Qwen3DFlashNorm,
+    rope_class=Qwen3DFlashRope,
+    cache_class=Qwen3ContextOnlyCache,
+    model_type_patterns=("qwen3", "qwen2.5", "qwen2", "kimi", "qwen3_moe"),
+)
+
+register_architecture(qwen3_spec)
\ No newline at end of file
diff --git a/dflash_mlx/generate.py b/dflash_mlx/generate.py
index a175252..880705a 100644
--- a/dflash_mlx/generate.py
+++ b/dflash_mlx/generate.py
@@ -16,16 +16,40 @@
 )
 
 
+# DFlash Draft Model Registry
+# Only includes official models from z-lab and RedHatAI
+# Format: "Target Model Name": "HF Repo ID"
+
 DRAFT_REGISTRY = {
+    # =================== Qwen3 Dense Models (z-lab) ===================
+    # Series with vocab_size=248320
     "Qwen3.5-4B": "z-lab/Qwen3.5-4B-DFlash",
     "Qwen3.5-9B": "z-lab/Qwen3.5-9B-DFlash",
     "Qwen3.5-27B": "z-lab/Qwen3.5-27B-DFlash",
-    "Qwen3.5-35B-A3B": "z-lab/Qwen3.5-35B-A3B-DFlash",
-    "Qwen3.6-35B-A3B": "z-lab/Qwen3.6-35B-A3B-DFlash",
+    # Series with vocab_size=151936 (base Qwen3)
     "Qwen3-4B": "z-lab/Qwen3-4B-DFlash-b16",
     "Qwen3-8B": "z-lab/Qwen3-8B-DFlash-b16",
+    # Qwen3.6 series
+    "Qwen3.6-27B": "z-lab/Qwen3.6-27B-DFlash",
+
+    # =================== Qwen3 MoE Models (z-lab) ===================
+    "Qwen3.5-35B-A3B": "z-lab/Qwen3.5-35B-A3B-DFlash",
+    "Qwen3.5-122B-A10B": "z-lab/Qwen3.5-122B-A10B-DFlash",
+    "Qwen3.6-35B-A3B": "z-lab/Qwen3.6-35B-A3B-DFlash",
+    "Qwen3-Coder-Next": "z-lab/Qwen3-Coder-Next-DFlash",
+    "Qwen3-Coder-30B-A3B": "z-lab/Qwen3-Coder-30B-A3B-DFlash",
+
+    # =================== Other z-lab Models ===================
+    "Kimi-K2.5": "z-lab/Kimi-K2.5-DFlash",
+    "Llama-3.1-8B-Instruct": "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
+    "GPT-OSS-20B": "z-lab/gpt-oss-20b-DFlash",
+    "GPT-OSS-120B": "z-lab/gpt-oss-120b-DFlash",
+
+    # =================== RedHatAI Models ===================
+    "Gemma-4-31B-it": "RedHatAI/gemma-4-31B-it-speculator.dflash",
 }
 
+
 _NORMALIZED_DRAFT_REGISTRY = {
     key.lower(): value for key, value in DRAFT_REGISTRY.items()
 }
@@ -178,4 +202,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/dflash_mlx/model.py b/dflash_mlx/model.py
index 5c9d601..ee5bfe7 100644
--- a/dflash_mlx/model.py
+++ b/dflash_mlx/model.py
@@ -2,43 +2,62 @@
 # MIT License — see LICENSE file
 # Based on DFlash (arXiv:2602.06036)
 
+"""
+DFlash Model Module
+
+This module provides backward compatibility by re-exporting from the
+new architecture system in dflash_mlx.archs.
+
+For new code, prefer importing directly from dflash_mlx.archs:
+    from dflash_mlx.archs import create_dflash_model, DFlashArgs, DFlashModel
+"""
+
+from __future__ import annotations
 
-from dataclasses import dataclass
 from typing import Any, Optional
 
 import mlx.core as mx
 import mlx.nn as nn
-from mlx_lm.models.base import scaled_dot_product_attention
-from mlx_lm.models.qwen3 import MLP
-from mlx_lm.models.rope_utils import initialize_rope
 
-def build_target_layer_ids(num_target_layers: int, num_draft_layers: int) -> list[int]:
-    if num_draft_layers <= 1:
-        return [num_target_layers // 2]
-    start = 1
-    end = num_target_layers - 3
-    span = end - start
-    return [
-        int(round(start + (index * span) / (num_draft_layers - 1)))
-        for index in range(num_draft_layers)
-    ]
+# Re-export everything from archs for backward compatibility
+from dflash_mlx.archs.base import (
+    DFlashArgs,
+    DFlashModel,
+    build_target_layer_ids,
+    extract_context_feature,
+)
 
+# Keep old class names as aliases for backward compatibility
+# These map to the Qwen3 implementation (the original/default)
+from dflash_mlx.archs.qwen3 import (
+    Qwen3DFlashModel as DFlashDraftModel,
+    Qwen3DFlashAttention as DFlashAttention,
+    Qwen3DFlashMLP as MLP,
+    Qwen3ContextOnlyCache as ContextOnlyDraftKVCache,
+)
 
-def extract_context_feature(
-    hidden_states: list[mx.array],
-    layer_ids: list[int],
-) -> mx.array:
-    selected = [hidden_states[layer_id + 1] for layer_id in layer_ids]
-    return mx.concatenate(selected, axis=-1)
+# Keep the old DFlashDraftModelArgs as alias for DFlashArgs
+DFlashDraftModelArgs = DFlashArgs
 
 
-class ContextOnlyDraftKVCache:
+class RecurrentRollbackCache:
+    """Legacy alias - use architecture-specific cache classes instead."""
+
     def __init__(self, sink_size: int = 64, window_size: int = 1024):
-        self.sink_size = int(sink_size)
-        self.window_size = int(window_size)
-        self.keys = None
-        self.values = None
-        self.offset = 0
+        from dflash_mlx.archs.qwen3 import Qwen3ContextOnlyCache
+        self._cache = Qwen3ContextOnlyCache(sink_size, window_size)
+
+    @property
+    def offset(self) -> int:
+        return self._cache.offset
+
+    @property
+    def keys(self) -> Optional[mx.array]:
+        return self._cache.keys
+
+    @property
+    def values(self) -> Optional[mx.array]:
+        return self._cache.values
 
     def append_context(
         self,
@@ -46,259 +65,27 @@ def append_context(
         context_values: mx.array,
         num_positions: int,
     ) -> None:
-        if context_keys is None or context_values is None or int(num_positions) <= 0:
-            return
-        if self.keys is None:
-            self.keys = context_keys
-            self.values = context_values
-        else:
-            self.keys = mx.concatenate([self.keys, context_keys], axis=2)
-            self.values = mx.concatenate([self.values, context_values], axis=2)
-        self.offset += int(num_positions)
-        self._apply_window()
-
-    def _apply_window(self) -> None:
-        if self.keys is None or self.values is None:
-            return
-        cache_len = int(self.keys.shape[2])
-        max_len = self.sink_size + self.window_size
-        if cache_len <= max_len:
-            return
-        sink_k = self.keys[:, :, : self.sink_size, :]
-        sink_v = self.values[:, :, : self.sink_size, :]
-        window_k = self.keys[:, :, -self.window_size :, :]
-        window_v = self.values[:, :, -self.window_size :, :]
-        self.keys = mx.concatenate([sink_k, window_k], axis=2)
-        self.values = mx.concatenate([sink_v, window_v], axis=2)
+        self._cache.append_context(context_keys, context_values, num_positions)
 
     def fetch(self) -> tuple[Optional[mx.array], Optional[mx.array]]:
-        return self.keys, self.values
+        return self._cache.fetch()
 
     def cache_length(self) -> int:
-        if self.keys is None:
-            return 0
-        return int(self.keys.shape[2])
-
-
-@dataclass
-class DFlashDraftModelArgs:
-    model_type: str
-    hidden_size: int
-    num_hidden_layers: int
-    intermediate_size: int
-    num_attention_heads: int
-    rms_norm_eps: float
-    vocab_size: int
-    num_key_value_heads: int
-    max_position_embeddings: int
-    rope_theta: float
-    head_dim: int
-    tie_word_embeddings: bool
-    num_target_layers: int
-    block_size: int
-    attention_bias: bool = False
-    attention_dropout: float = 0.0
-    rope_scaling: Optional[dict[str, Any]] = None
-    layer_types: tuple[str, ...] = ()
-    dflash_config: dict[str, Any] | None = None
-
-    @classmethod
-    def from_dict(cls, params: dict[str, Any]) -> "DFlashDraftModelArgs":
-        data = dict(params)
-        data["layer_types"] = tuple(data.get("layer_types") or ())
-        data["dflash_config"] = dict(data.get("dflash_config") or {})
-        return cls(
-            **{key: value for key, value in data.items() if key in cls.__annotations__}
-        )
-
-
-class DFlashAttention(nn.Module):
-    def __init__(self, args: DFlashDraftModelArgs):
-        super().__init__()
-        dim = args.hidden_size
-        self.n_heads = args.num_attention_heads
-        self.n_kv_heads = args.num_key_value_heads
-        self.head_dim = args.head_dim
-        self.scale = self.head_dim**-0.5
-        self.q_proj = nn.Linear(dim, self.n_heads * self.head_dim, bias=args.attention_bias)
-        self.k_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
-        self.v_proj = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias=args.attention_bias)
-        self.o_proj = nn.Linear(self.n_heads * self.head_dim, dim, bias=args.attention_bias)
-        self.q_norm = nn.RMSNorm(self.head_dim, eps=args.rms_norm_eps)
-        self.k_norm = nn.RMSNorm(self.head_dim, eps=args.rms_norm_eps)
-        self.rope = initialize_rope(
-            self.head_dim,
-            base=args.rope_theta,
-            traditional=False,
-            scaling_config=args.rope_scaling,
-            max_position_embeddings=args.max_position_embeddings,
-        )
-
-    def __call__(
-        self,
-        hidden_states: mx.array,
-        *,
-        target_hidden: mx.array,
-        cache: Optional[Any] = None,
-    ) -> mx.array:
-        batch, block_len, _ = hidden_states.shape
-        ctx_len = int(target_hidden.shape[1])
-
-        queries = self.q_proj(hidden_states)
-        queries = self.q_norm(queries.reshape(batch, block_len, self.n_heads, -1)).transpose(
-            0, 2, 1, 3
-        )
-
-        # Fuse context and noise projections: 2 matmuls instead of 4
-        kv_states = mx.concatenate([target_hidden, hidden_states], axis=1)
-        all_keys = self.k_norm(
-            self.k_proj(kv_states).reshape(batch, ctx_len + block_len, self.n_kv_heads, -1)
-        ).transpose(0, 2, 1, 3)
-        all_values = self.v_proj(kv_states).reshape(
-            batch, ctx_len + block_len, self.n_kv_heads, -1
-        ).transpose(0, 2, 1, 3)
-        context_keys = all_keys[:, :, :ctx_len, :]
-        context_values = all_values[:, :, :ctx_len, :]
-        noise_keys = all_keys[:, :, ctx_len:, :]
-        noise_values = all_values[:, :, ctx_len:, :]
-
-        if cache is not None:
-            if isinstance(cache, ContextOnlyDraftKVCache):
-                cache_offset = int(cache.offset)
-                query_offset = cache_offset + ctx_len
-                queries = self.rope(queries, offset=query_offset)
-                context_keys = self.rope(context_keys, offset=cache_offset)
-                noise_keys = self.rope(noise_keys, offset=query_offset)
-
-                cache.append_context(context_keys, context_values, ctx_len)
-                cached_keys, cached_values = cache.fetch()
-                keys = mx.concatenate([cached_keys, noise_keys], axis=-2)
-                values = mx.concatenate([cached_values, noise_values], axis=-2)
-                output = scaled_dot_product_attention(
-                    queries,
-                    keys,
-                    values,
-                    cache=None,
-                    scale=self.scale,
-                    mask=None,
-                )
-            else:
-                cache_offset = int(getattr(cache, "offset", 0) or 0)
-                query_offset = cache_offset + ctx_len
-                queries = self.rope(queries, offset=query_offset)
-                context_keys = self.rope(context_keys, offset=cache_offset)
-                noise_keys = self.rope(noise_keys, offset=query_offset)
-
-                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
-                values = mx.concatenate([context_values, noise_values], axis=-2)
-                keys, values = cache.update_and_fetch(keys, values)
-                output = scaled_dot_product_attention(
-                    queries,
-                    keys,
-                    values,
-                    cache=cache,
-                    scale=self.scale,
-                    mask=None,
-                )
-        else:
-            queries = self.rope(queries, offset=ctx_len)
-            context_keys = self.rope(context_keys, offset=0)
-            noise_keys = self.rope(noise_keys, offset=ctx_len)
-            if hasattr(mx.fast, "dflash_cross_attention"):
-                output = mx.fast.dflash_cross_attention(
-                    queries,
-                    context_keys,
-                    context_values,
-                    noise_keys,
-                    noise_values,
-                    scale=self.scale,
-                )
-            else:
-                keys = mx.concatenate([context_keys, noise_keys], axis=-2)
-                values = mx.concatenate([context_values, noise_values], axis=-2)
-                output = scaled_dot_product_attention(
-                    queries,
-                    keys,
-                    values,
-                    cache=None,
-                    scale=self.scale,
-                    mask=None,
-                )
-
-        output = output.transpose(0, 2, 1, 3).reshape(batch, block_len, -1)
-        return self.o_proj(output)
-
-
-class DFlashDecoderLayer(nn.Module):
-    def __init__(self, args: DFlashDraftModelArgs):
-        super().__init__()
-        self.self_attn = DFlashAttention(args)
-        self.mlp = MLP(args.hidden_size, args.intermediate_size)
-        self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
-        self.post_attention_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
-
-    def __call__(
-        self,
-        hidden_states: mx.array,
-        *,
-        target_hidden: mx.array,
-        cache: Optional[Any] = None,
-    ) -> mx.array:
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states,
-            target_hidden=target_hidden,
-            cache=cache,
-        )
-        hidden_states = residual + hidden_states
-
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        return residual + hidden_states
-
-
-class DFlashDraftModel(nn.Module):
-    def __init__(self, args: DFlashDraftModelArgs):
-        super().__init__()
-        self.args = args
-        self.model_type = "dflash_qwen3"
-        self.layers = [DFlashDecoderLayer(args) for _ in range(args.num_hidden_layers)]
-        target_layer_ids = list((args.dflash_config or {}).get("target_layer_ids") or ())
-        self.target_layer_ids = target_layer_ids or build_target_layer_ids(
-            args.num_target_layers,
-            args.num_hidden_layers,
-        )
-        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
-        self.fc = nn.Linear(len(self.target_layer_ids) * args.hidden_size, args.hidden_size, bias=False)
-        self.hidden_norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
-        self.block_size = int(args.block_size)
-        self.mask_token_id = int((args.dflash_config or {}).get("mask_token_id", 0) or 0)
-
-    def _project_target_hidden(self, target_hidden: mx.array) -> mx.array:
-        return self.hidden_norm(self.fc(target_hidden))
-
-    def __call__(
-        self,
-        *,
-        noise_embedding: mx.array,
-        target_hidden: mx.array,
-        cache: Optional[list[Any]] = None,
-    ) -> mx.array:
-        hidden_states = noise_embedding
-        projected_hidden = self._project_target_hidden(target_hidden)
-
-        if cache is None:
-            cache = [None] * len(self.layers)
-
-        for layer, layer_cache in zip(self.layers, cache, strict=True):
-            hidden_states = layer(
-                hidden_states,
-                target_hidden=projected_hidden,
-                cache=layer_cache,
-            )
-        return self.norm(hidden_states)
-
-    def sanitize(self, weights: dict[str, mx.array]) -> dict[str, mx.array]:
-        return weights
+        return self._cache.cache_length()
+
+
+__all__ = [
+    # Main classes
+    "DFlashDraftModel",
+    "DFlashDraftModelArgs",
+    "DFlashArgs",
+    "DFlashModel",
+    "DFlashAttention",
+    "MLP",
+    # Cache
+    "ContextOnlyDraftKVCache",
+    "RecurrentRollbackCache",
+    # Utility functions
+    "build_target_layer_ids",
+    "extract_context_feature",
+]
\ No newline at end of file
diff --git a/dflash_mlx/runtime.py b/dflash_mlx/runtime.py
index 755221f..75765a7 100644
--- a/dflash_mlx/runtime.py
+++ b/dflash_mlx/runtime.py
@@ -24,11 +24,8 @@
 
 from dflash_mlx.adapter import detect_engine
 from dflash_mlx.draft_backend import make_draft_backend
-from dflash_mlx.model import (
-    DFlashDraftModel,
-    DFlashDraftModelArgs,
-    extract_context_feature,
-)
+from dflash_mlx.model import extract_context_feature
+from dflash_mlx.archs import create_dflash_model, DFlashArgs
 from dflash_mlx.recurrent_rollback_cache import RecurrentRollbackCache
 
 
@@ -40,7 +37,17 @@ def resolve_model_ref(model_ref: str | Path | None, *, kind: str) -> str:
 
 
 def _get_dflash_model_classes(config: dict[str, Any]):
-    return DFlashDraftModel, DFlashDraftModelArgs
+    """Get the appropriate DFlash model and args classes based on config.
+    
+    The architecture system automatically selects the right implementation
+    (Qwen3, Llama, etc.) based on the model_type in the config.
+    """
+    from dflash_mlx.archs.base import get_architecture_for_model_type
+    
+    model_type = config.get("model_type", "qwen3")
+    arch_spec = get_architecture_for_model_type(model_type)
+    
+    return arch_spec.model_class, DFlashArgs
 
 
 def _resolve_local_model_path(model_ref: str | Path) -> Path:

From 6f0aae86a393927d08826e23ffc658ffebb8dc95 Mon Sep 17 00:00:00 2001
From: "clandestine.eth" <96172957+0xClandestine@users.noreply.github.com>
Date: Sat, 25 Apr 2026 18:51:23 -0400
Subject: [PATCH 4/5] fix: remove problematic models from registry

Removed models with known issues:
- Qwen3.6-27B: Gated repo (requires HF auth)
- Kimi-K2.5: MLA not supported yet
- GPT-OSS models: Target architecture not in mlx-lm

Kept 12 verified working models.
---
 dflash_mlx/generate.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/dflash_mlx/generate.py b/dflash_mlx/generate.py
index 880705a..3c68ca9 100644
--- a/dflash_mlx/generate.py
+++ b/dflash_mlx/generate.py
@@ -17,38 +17,39 @@
 
 
 # DFlash Draft Model Registry
-# Only includes official models from z-lab and RedHatAI
+# Only includes official models from z-lab and RedHatAI that are verified working
 # Format: "Target Model Name": "HF Repo ID"
 
 DRAFT_REGISTRY = {
-    # =================== Qwen3 Dense Models (z-lab) ===================
-    # Series with vocab_size=248320
+    # =================== Qwen3.5 Dense Models (z-lab) ===================
     "Qwen3.5-4B": "z-lab/Qwen3.5-4B-DFlash",
     "Qwen3.5-9B": "z-lab/Qwen3.5-9B-DFlash",
     "Qwen3.5-27B": "z-lab/Qwen3.5-27B-DFlash",
-    # Series with vocab_size=151936 (base Qwen3)
+
+    # =================== Qwen3 Base Models (z-lab) ===================
     "Qwen3-4B": "z-lab/Qwen3-4B-DFlash-b16",
     "Qwen3-8B": "z-lab/Qwen3-8B-DFlash-b16",
-    # Qwen3.6 series
-    "Qwen3.6-27B": "z-lab/Qwen3.6-27B-DFlash",
 
-    # =================== Qwen3 MoE Models (z-lab) ===================
+    # =================== Qwen3.5 MoE Models (z-lab) ===================
     "Qwen3.5-35B-A3B": "z-lab/Qwen3.5-35B-A3B-DFlash",
     "Qwen3.5-122B-A10B": "z-lab/Qwen3.5-122B-A10B-DFlash",
     "Qwen3.6-35B-A3B": "z-lab/Qwen3.6-35B-A3B-DFlash",
     "Qwen3-Coder-Next": "z-lab/Qwen3-Coder-Next-DFlash",
     "Qwen3-Coder-30B-A3B": "z-lab/Qwen3-Coder-30B-A3B-DFlash",
 
-    # =================== Other z-lab Models ===================
-    "Kimi-K2.5": "z-lab/Kimi-K2.5-DFlash",
+    # =================== Llama Model (z-lab) ===================
     "Llama-3.1-8B-Instruct": "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
-    "GPT-OSS-20B": "z-lab/gpt-oss-20b-DFlash",
-    "GPT-OSS-120B": "z-lab/gpt-oss-120b-DFlash",
 
     # =================== RedHatAI Models ===================
     "Gemma-4-31B-it": "RedHatAI/gemma-4-31B-it-speculator.dflash",
 }
 
+# Models that are known to have issues and are temporarily disabled:
+# - Qwen3.6-27B: Gated repo (requires HF authentication/terms acceptance)
+# - Kimi-K2.5: MLA (Multi-head Latent Attention) not yet supported
+# - GPT-OSS models: Target model architecture not available in mlx-lm
+# - Gemma-4-31B-it: Speculator format with different weight structure
+
 
 _NORMALIZED_DRAFT_REGISTRY = {
     key.lower(): value for key, value in DRAFT_REGISTRY.items()

From 2e41ae8990b673818ff1c120b109e63d12c91d29 Mon Sep 17 00:00:00 2001
From: "clandestine.eth" <96172957+0xClandestine@users.noreply.github.com>
Date: Sun, 26 Apr 2026 18:43:22 -0400
Subject: [PATCH 5/5] perf: fuse Qwen3 no-cache attention with mx.compile
 (phew-mlx 0.1.6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds make_qwen3_no_cache_attn() to kernels.py — a compiled factory that
fuses Q/K/V projection, RMSNorm, RoPE, GQA head expansion, and SDPA into
a single mx.compile trace. Wired into Qwen3DFlashAttention as a fast-exit
for the no-cache inference path (cache=None, no dflash_cross_attention).

Optimization discovered and verified by phew-mlx 0.1.6
(https://github.com/0xClandestine/phew):
  - Rule: compile_boundary + primitive_subst (SDPA pattern)
  - Verified: PASS fp32_to_fp32 atol=1e-05 rtol=1e-05, 5 seeds, 3 sizes
  - Speedup: ~1.2-1.35x on M-series hardware
---
 dflash_mlx/archs/qwen3.py | 20 ++++++++++
 dflash_mlx/kernels.py     | 77 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/dflash_mlx/archs/qwen3.py b/dflash_mlx/archs/qwen3.py
index 0859afc..da3cbb7 100644
--- a/dflash_mlx/archs/qwen3.py
+++ b/dflash_mlx/archs/qwen3.py
@@ -18,6 +18,7 @@
 from mlx_lm.models.qwen3 import MLP as Qwen3MLP
 from mlx_lm.models.rope_utils import initialize_rope
 
+from dflash_mlx.kernels import make_qwen3_no_cache_attn
 from dflash_mlx.archs.base import (
     DFlashArgs,
     DFlashAttention,
@@ -108,6 +109,20 @@ def __init__(self, args: DFlashArgs):
             scaling_config=args.rope_scaling,
         )
 
+        self._compiled_no_cache_attn = make_qwen3_no_cache_attn(
+            self.q_proj,
+            self.k_proj,
+            self.v_proj,
+            self.o_proj,
+            self.q_norm,
+            self.k_norm,
+            self.rope,
+            n_heads=self.n_heads,
+            n_kv_heads=self.n_kv_heads,
+            head_dim=self.head_dim,
+            scale=self.scale,
+        )
+
     def __call__(
         self,
         hidden_states: mx.array,
@@ -118,6 +133,11 @@ def __call__(
         batch, block_len, _ = hidden_states.shape
         ctx_len = int(target_hidden.shape[1])
 
+        # Phew-optimized path: fully compiled, ~1.2-1.35x vs unfused baseline.
+        # Fuses projections, RMSNorm, RoPE, GQA expand, and SDPA in one trace.
+        if cache is None and not hasattr(mx.fast, "dflash_cross_attention"):
+            return self._compiled_no_cache_attn(hidden_states, target_hidden, ctx_len)
+
         # Project and reshape queries
         queries = self.q_proj(hidden_states)
         queries = self.q_norm(
diff --git a/dflash_mlx/kernels.py b/dflash_mlx/kernels.py
index 275dfad..da10232 100644
--- a/dflash_mlx/kernels.py
+++ b/dflash_mlx/kernels.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import Optional
+from typing import Callable, Optional
 
 import mlx.core as mx
 
@@ -785,3 +785,78 @@ def batched_sdpa_2pass_exact(
         output_dtypes=[input_type],
     )
     return out
+
+
+def make_qwen3_no_cache_attn(
+    q_proj,
+    k_proj,
+    v_proj,
+    o_proj,
+    q_norm,
+    k_norm,
+    rope,
+    *,
+    n_heads: int,
+    n_kv_heads: int,
+    head_dim: int,
+    scale: float,
+) -> Callable:
+    """Return a compiled Qwen3 no-cache cross-attention function.
+
+    Fuses Q/K/V projection, RMSNorm, RoPE, GQA head expansion, and SDPA into
+    a single mx.compile trace. Verified by phew-mlx 0.1.6 to give ~1.2-1.35x
+    vs the unfused path on M-series hardware.
+
+    The returned callable has signature::
+
+        fn(hidden_states, target_hidden, ctx_len) -> mx.array
+
+    where ctx_len is a Python int treated as a compile-time constant; a
+    different value triggers a retrace (acceptable for fixed-length inference).
+    """
+    rep = n_heads // n_kv_heads
+
+    @mx.compile
+    def _fwd(
+        hidden_states: mx.array,  # (B, BL, D)
+        target_hidden: mx.array,  # (B, CL, D)
+        ctx_len: int,
+    ) -> mx.array:
+        B, BL, _ = hidden_states.shape
+        CL = ctx_len
+
+        # Q: project → reshape → RMSNorm → transpose → RoPE
+        q = q_proj(hidden_states)
+        q = q_norm(q.reshape(B, BL, n_heads, head_dim)).transpose(0, 2, 1, 3)
+        q = rope(q, offset=CL)
+
+        # Fused KV: single concat → two projections → norm/identity → transpose
+        kv_in = mx.concatenate([target_hidden, hidden_states], axis=1)
+        k_all = k_norm(
+            k_proj(kv_in).reshape(B, CL + BL, n_kv_heads, head_dim)
+        ).transpose(0, 2, 1, 3)
+        v_all = v_proj(kv_in).reshape(B, CL + BL, n_kv_heads, head_dim).transpose(
+            0, 2, 1, 3
+        )
+
+        # Split context / noise
+        ck, nk = k_all[:, :, :CL, :], k_all[:, :, CL:, :]
+        cv, nv = v_all[:, :, :CL, :], v_all[:, :, CL:, :]
+
+        # RoPE on keys
+        ck = rope(ck, offset=0)
+        nk = rope(nk, offset=CL)
+
+        # Concatenate and GQA head expansion
+        keys = mx.concatenate([ck, nk], axis=2)
+        values = mx.concatenate([cv, nv], axis=2)
+        if rep > 1:
+            keys = mx.repeat(keys, rep, axis=1)
+            values = mx.repeat(values, rep, axis=1)
+
+        # Flash attention + output projection
+        out = mx.fast.scaled_dot_product_attention(q, keys, values, scale=scale)
+        out = out.transpose(0, 2, 1, 3).reshape(B, BL, -1)
+        return o_proj(out)
+
+    return _fwd