diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
index 5d04d3369f..8feba60128 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
@@ -1269,6 +1269,10 @@ def validate_renderer_args(self):
             renderer_args_set.append(f"renderer.reasoning_parser={self.renderer.reasoning_parser!r}")
         if self.renderer.pool_size is not None:
             renderer_args_set.append(f"renderer.pool_size={self.renderer.pool_size!r}")
+        if self.renderer.preserve_all_thinking:
+            renderer_args_set.append("renderer.preserve_all_thinking=true")
+        if self.renderer.preserve_thinking_between_tool_calls:
+            renderer_args_set.append("renderer.preserve_thinking_between_tool_calls=true")
 
         if renderer_args_set:
             raise ValueError(
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/rl.py b/packages/prime-rl-configs/src/prime_rl/configs/rl.py
index a160af2c9f..8d5f087da7 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/rl.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/rl.py
@@ -791,6 +791,19 @@ def auto_setup_router_replay(self):
             )
         return self
 
+    @model_validator(mode="after")
+    def validate_router_replay_without_kv_offload(self):
+        if (
+            self.trainer.enable_router_replay
+            and self.inference is not None
+            and self.inference.kv_cache_offload is not None
+        ):
+            raise ValueError(
+                "Router replay with inference.kv_cache_offload is not supported. "
+                "External KV cache hits do not carry routed-expert decisions."
+            )
+        return self
+
     @model_validator(mode="after")
     def auto_setup_deployment(self):
         if self.deployment.type == "single_node":  # single-node
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/shared.py b/packages/prime-rl-configs/src/prime_rl/configs/shared.py
index d26c33d9a9..1651e17b9f 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/shared.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/shared.py
@@ -186,6 +186,26 @@ class RendererConfig(BaseConfig):
         ),
     ] = None
 
+    preserve_all_thinking: Annotated[
+        bool,
+        Field(
+            description=(
+                "Forward preserve_all_thinking to the renderer client. When true, "
+                "past-assistant reasoning_content is re-emitted on subsequent renders."
+            ),
+        ),
+    ] = False
+
+    preserve_thinking_between_tool_calls: Annotated[
+        bool,
+        Field(
+            description=(
+                "Forward preserve_thinking_between_tool_calls to the renderer client. "
+                "This preserves thinking only inside the active assistant/tool block."
+            ),
+        ),
+    ] = False
+
 
 class ElasticConfig(BaseConfig):
     """Configures elastic inference pool with DNS-based service discovery.
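For intuition on what the new preserve_all_thinking flag controls downstream, here is a hypothetical, self-contained sketch of the semantics described in the RendererConfig field docs above; the helper name and message shapes are illustrative only, not the renderer-client API. By default only the latest assistant turn keeps its reasoning_content when earlier turns are re-rendered.

# Hypothetical sketch of preserve_all_thinking semantics; the helper and message
# shapes are illustrative, not the renderer-client API.
def strip_past_thinking(messages: list[dict], preserve_all_thinking: bool) -> list[dict]:
    last = max(i for i, m in enumerate(messages) if m["role"] == "assistant")
    rendered = []
    for i, message in enumerate(messages):
        message = dict(message)
        if message["role"] == "assistant" and i != last and not preserve_all_thinking:
            message.pop("reasoning_content", None)  # past-turn thinking is dropped
        rendered.append(message)
    return rendered

history = [
    {"role": "assistant", "reasoning_content": "plan A", "content": "call tool"},
    {"role": "tool", "content": "result"},
    {"role": "assistant", "reasoning_content": "plan B", "content": "final answer"},
]
assert "reasoning_content" not in strip_past_thinking(history, False)[0]
assert "reasoning_content" in strip_past_thinking(history, True)[0]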
diff --git a/pyproject.toml b/pyproject.toml
index d9b5468fa0..a54daaa387 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "torchaudio",
     "torchdata>=0.11.0",
     "transformers",
-    "vllm>=0.20.2",
+    "vllm",
     "wandb>=0.26.1",
     "ring-flash-attn>=0.1.8",
     "prime>=0.6.4",
@@ -36,6 +36,7 @@ dependencies = [
     "tilelang>=0.1.8",
     "flash-linear-attention",
     "nvidia-ml-py>=12.575.51",
+    "pybase64>=1.4.2",
 ]
 
 [project.scripts]
@@ -130,6 +131,7 @@ override-dependencies = [
 [tool.uv.exclude-newer-package]
 # we want latest vllm, remove next patch
 vllm = false
+tokenspeed-mla = false
 flash_attn_3 = false
 # Self-vendored packages on our primeintellect index
 reverse-text = false
@@ -166,15 +168,15 @@ prime-rl-configs = { workspace = true }
 torch = { index = "pytorch-cu128" }
 torchvision = { index = "pytorch-cu128" }
 torchaudio = { index = "pytorch-cu128" }
-verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", rev = "aa428f3" }
+verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers", rev = "3708ede" }
 torchtitan = { git = "https://github.com/pytorch/torchtitan", rev = "a1fdd7e" }
 dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
 transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
 flash-attn-4 = { git = "https://github.com/Dao-AILab/flash-attention.git", subdirectory = "flash_attn/cute", rev = "96bd151" }
 pydantic-config = { git = "https://github.com/samsja/pydantic_config.git", branch = "main" }
-vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" }
+vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }
 vllm = [
-    { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl", marker = "platform_machine == 'x86_64'" },
+    { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl", marker = "platform_machine == 'x86_64'" },
     { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", marker = "platform_machine == 'aarch64'" },
 ]
 reverse-text = { index = "primeintellect" }
diff --git a/src/prime_rl/inference/patches.py b/src/prime_rl/inference/patches.py
index 974aed5f82..780086be08 100644
--- a/src/prime_rl/inference/patches.py
+++ b/src/prime_rl/inference/patches.py
@@ -19,6 +19,51 @@ def transformers_v5_compat():
     monkey_patch_deep_gemm_silu_mul_quant_int64()
     monkey_patch_dp_engine_core_pause_resume_deadlock()
     monkey_patch_vllm_layerwise_reload_alias_buffers()
+    monkey_patch_return_routed_experts_with_nixl_connector()
+
+
+def monkey_patch_return_routed_experts_with_nixl_connector():
+    from vllm import envs
+    from vllm.config.vllm import VllmConfig
+    from vllm.logger import init_logger
+
+    logger = init_logger(__name__)
+    original_post_init = VllmConfig.__post_init__
+
+    if getattr(original_post_init, "_prime_rl_allows_nixl_routed_experts", False):
+        return
+
+    def _is_nixl_routed_experts_pd_config(config: VllmConfig) -> bool:
+        kv_transfer_config = config.kv_transfer_config
+        return (
+            config.model_config is not None
+            and config.model_config.enable_return_routed_experts
+            and kv_transfer_config is not None
+            and kv_transfer_config.kv_connector == "NixlConnector"
+            and kv_transfer_config.is_kv_transfer_instance
+        )
+
+    def _post_init(config: VllmConfig):
+        if not _is_nixl_routed_experts_pd_config(config):
+            return original_post_init(config)
+
+        if config.parallel_config.pipeline_parallel_size > 1:
+            raise ValueError("--enable-return-routed-experts is incompatible with pipeline parallelism (PP > 1).")
+        if envs.VLLM_USE_V2_MODEL_RUNNER:
+            raise ValueError("VLLM_USE_V2_MODEL_RUNNER does not yet support: routed experts capture")
+
+        # vLLM's validation rejects routed-experts capture with any KV connector,
+        # but our P/D path uses NIXL and stitches prefill/decode routed experts in
+        # the router. CPU KV offload remains rejected by prime-rl config validation.
+        config.model_config.enable_return_routed_experts = False
+        try:
+            return original_post_init(config)
+        finally:
+            config.model_config.enable_return_routed_experts = True
+
+    _post_init._prime_rl_allows_nixl_routed_experts = True
+    VllmConfig.__post_init__ = _post_init
+    logger.warning("Enabled vLLM routed-experts capture with NIXL connector patch.")
 
 
 def monkey_patch_vllm_layerwise_reload_alias_buffers():
@@ -897,9 +942,9 @@ def monkey_patch_dp_engine_core_pause_resume_deadlock():
     - on resume, wake every DP rank and force an immediate global unfinished
       sync instead of waiting for the normal 32-step cadence
 
-    This keeps the upstream pause-side fix from
-    https://github.com/vllm-project/vllm/pull/37024 and extends it with the
-    resume-side wave-state fix.
+    This also bypasses vLLM's two-phase DP pause implementation
+    (https://github.com/vllm-project/vllm/pull/39366), whose resume path
+    rejects states that our weight-update flow can validly hit.
     """
     from vllm.config import ParallelConfig
     from vllm.v1.core.sched.interface import PauseState
@@ -909,7 +954,8 @@ def monkey_patch_dp_engine_core_pause_resume_deadlock():
 
     _base_add_request = EngineCore.add_request
     _base_handle_client_request = EngineCoreProc._handle_client_request
-    _base_resume_scheduler = DPEngineCoreProc.resume_scheduler
+    _base_pause_complete = EngineCoreProc._pause_complete
+    _base_resume_scheduler = EngineCoreProc.resume_scheduler
 
     def _patched_add_request(self, request: Request, request_wave: int = 0):
         _base_add_request(self, request, request_wave)
@@ -930,8 +976,15 @@ def _patched_handle_client_request(self, request_type, request):
         else:
             _base_handle_client_request(self, request_type, request)
 
+    def _patched_pause_complete(self) -> bool:
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
+        return _base_pause_complete(self)
+
     def _patched_resume_scheduler(self):
         was_paused = self.scheduler.pause_state != PauseState.UNPAUSED
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
         _base_resume_scheduler(self)
         if was_paused:
             self.engines_running = True
@@ -948,6 +1001,7 @@ def _patched_has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
 
     DPEngineCoreProc.add_request = _patched_add_request
     DPEngineCoreProc._handle_client_request = _patched_handle_client_request
+    DPEngineCoreProc._pause_complete = _patched_pause_complete
     DPEngineCoreProc.resume_scheduler = _patched_resume_scheduler
     DPEngineCoreProc._has_global_unfinished_reqs = _patched_has_global_unfinished_reqs
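The NIXL patch above hinges on a small flip-restore pattern: hide the flag from the wrapped validator, then restore it for runtime. A minimal standalone sketch of the idea (toy Config class, not vLLM's API):

# Toy stand-in for VllmConfig/__post_init__; not vLLM API. The wrapped validator
# never sees the flag, but callers observe it enabled again afterwards.
class Config:
    def __init__(self):
        self.flag = True

    def validate(self):
        if self.flag:
            raise ValueError("flag rejected by upstream validation")

original_validate = Config.validate

def patched_validate(self):
    self.flag = False  # hide the flag from the upstream check
    try:
        return original_validate(self)
    finally:
        self.flag = True  # restore it for runtime

Config.validate = patched_validate

config = Config()
config.validate()  # no longer raises
assert config.flag  # still enabled after validation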
diff --git a/src/prime_rl/inference/vllm/routed_experts.py b/src/prime_rl/inference/vllm/routed_experts.py
new file mode 100644
index 0000000000..cad97e8574
--- /dev/null
+++ b/src/prime_rl/inference/vllm/routed_experts.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from typing import Any
+
+import numpy as np
+import pybase64
+from vllm.outputs import RequestOutput
+
+
+def serialize_routed_experts(routed_experts: Any) -> dict[str, Any] | None:
+    if routed_experts is None:
+        return None
+
+    array = np.asarray(routed_experts)
+    assert array.ndim == 3
+    assert np.issubdtype(array.dtype, np.integer)
+    if array.size:
+        assert array.min() >= 0
+        assert array.max() <= np.iinfo(np.uint8).max
+
+    compact = np.ascontiguousarray(array.astype(np.uint8, copy=False))
+    return {
+        "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
+        "shape": list(compact.shape),
+    }
+
+
+class RoutedExpertsCapture:
+    def __init__(self, generator: AsyncIterator[RequestOutput]):
+        self._generator = generator
+        self.routed_experts: dict[int, dict[str, Any]] = {}
+
+    async def __aiter__(self):
+        async for request_output in self._generator:
+            for output in request_output.outputs:
+                encoded = serialize_routed_experts(getattr(output, "routed_experts", None))
+                if encoded is not None:
+                    self.routed_experts[output.index] = encoded
+            yield request_output
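The payload produced by serialize_routed_experts above is undone by _decode_routed_experts in the orchestrator (later in this diff). A minimal round-trip sketch with toy values, assuming only numpy and pybase64:

import numpy as np
import pybase64

# Toy [seq_len, layers, topk] routing array; expert ids must fit in uint8.
routing = np.array([[[3, 7], [1, 0]], [[2, 5], [4, 6]]], dtype=np.int64)
compact = np.ascontiguousarray(routing.astype(np.uint8, copy=False))
payload = {
    "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
    "shape": list(compact.shape),
}
decoded = np.frombuffer(
    pybase64.b64decode_as_bytearray(payload["data"]), dtype=np.uint8
).reshape(payload["shape"])
np.testing.assert_array_equal(decoded, routing)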
diff --git a/src/prime_rl/inference/vllm/serving_chat_with_tokens.py b/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
index fae9465fbe..c78a76bde8 100644
--- a/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
+++ b/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
@@ -14,22 +14,11 @@
 from vllm.reasoning import ReasoningParser
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 
-from prime_rl.inference.vllm.serving_tokens import _RoutedExpertsCaptureBase
+from prime_rl.inference.vllm.routed_experts import RoutedExpertsCapture
 
 logger = init_logger(__name__)
 
 
-class _RoutedExpertsCapture(_RoutedExpertsCaptureBase):
-    """Chat-endpoint variant: mutates choices in-place because
-    ``ChatCompletionResponseChoice`` is ``extra='allow'``, so an extra
-    ``routed_experts`` attribute survives serialization."""
-
-    def post_process(self, response: ChatCompletionResponse) -> None:
-        for choice in response.choices:
-            if choice.index in self.routed_experts:
-                choice.routed_experts = self.routed_experts[choice.index]
-
-
 class ChatCompletionRequestWithTokens(ChatCompletionRequest):
     field_names: ClassVar[Optional[set[str]]] = None
     tokens: list[int] = Field(description=("Prompt tokens to use for the request."))
@@ -55,11 +44,10 @@ async def chat_completion_full_generator(
         # 1. We create a custom generator that encapsulates the original result_generator in self._generator
         # 2. We override its __aiter__ method to also capture the routed experts as an extra field in ChatCompletionResponse.choices
        # 3. We override the full_generator method to use the custom generator instead of the original one if expert routing is enabled
+        capture = None
         if self.model_config.enable_return_routed_experts:
-            capture = _RoutedExpertsCapture(result_generator)
+            capture = RoutedExpertsCapture(result_generator)
             result_generator = capture
-        else:
-            capture = None
 
         response = await super().chat_completion_full_generator(
             request,
@@ -72,8 +60,10 @@ async def chat_completion_full_generator(
             reasoning_parser,
         )
 
-        if capture and isinstance(response, ChatCompletionResponse):
-            capture.post_process(response)
+        if capture is not None and isinstance(response, ChatCompletionResponse):
+            for choice in response.choices:
+                if choice.index in capture.routed_experts:
+                    choice.routed_experts = capture.routed_experts[choice.index]
 
         return response
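The capture class wired in above is easy to study in isolation: it is an async-iterable wrapper that records a side channel while yielding items through unchanged, so downstream consumers are unaffected. A standalone sketch with toy data in place of RequestOutput:

import asyncio

# Sketch of the capture pattern: wrap an async iterator, record a side channel,
# and yield every item unchanged.
class Capture:
    def __init__(self, gen):
        self._gen = gen
        self.seen: list[int] = []

    async def __aiter__(self):
        async for item in self._gen:
            self.seen.append(item)
            yield item

async def numbers():
    for i in range(3):
        yield i

async def main():
    cap = Capture(numbers())
    assert [x async for x in cap] == [0, 1, 2]
    assert cap.seen == [0, 1, 2]

asyncio.run(main())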
diff --git a/src/prime_rl/inference/vllm/serving_tokens.py b/src/prime_rl/inference/vllm/serving_tokens.py
index 359df83d11..789b361c19 100644
--- a/src/prime_rl/inference/vllm/serving_tokens.py
+++ b/src/prime_rl/inference/vllm/serving_tokens.py
@@ -10,9 +10,9 @@
    header and forwarded to ``engine_client.generate``. The DP-replicated
    inference servers prime-RL runs need this to target a specific replica.
 
-2. ``routed_experts`` per-token export — when the engine emits routing
-   decisions (``enable_return_routed_experts``), surface them on each choice.
-   This is what the trainer's router-replay path consumes.
+2. Compact ``routed_experts`` export — when the engine emits routing
+   decisions, surface them as base64 raw-byte payloads without requiring a vLLM
+   source fork.
 3. Server-side ``max_tokens`` defaulting — ``ServingTokens`` hands the
    client-supplied ``SamplingParams`` to the engine verbatim, and
@@ -30,13 +30,11 @@
 
 from __future__ import annotations
 
-import base64
 from collections.abc import AsyncGenerator
 from functools import cached_property
+from typing import Any
 
-import numpy as np
 from fastapi import Request
-from pydantic import Field
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse, RequestResponseMetadata
 from vllm.entrypoints.serve.disagg.protocol import (
     GenerateRequest,
@@ -48,64 +46,29 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
+from prime_rl.inference.vllm.routed_experts import RoutedExpertsCapture
+
 
 class PrimeRlGenerateResponseChoice(GenerateResponseChoice):
-    routed_experts: dict | None = Field(
-        default=None,
-        description=(
-            "Per-token expert routing decisions (base85-encoded int32 array + shape). "
-            "Populated only when the engine was launched with "
-            "``enable_return_routed_experts=True``; otherwise ``None``."
-        ),
-    )
+    routed_experts: dict[str, Any] | None = None
 
 
 class PrimeRlGenerateResponse(GenerateResponse):
     choices: list[PrimeRlGenerateResponseChoice]
 
 
-def encode_routed_experts(arr: np.ndarray) -> dict:
-    return {
-        "data": base64.b85encode(arr.tobytes()).decode("ascii"),
-        "shape": list(arr.shape),
-    }
-
-
-class _RoutedExpertsCaptureBase:
-    """Wraps the engine result generator and accumulates a
-    ``{output_index: encoded_experts}`` map as outputs stream. Subclasses
-    implement ``post_process`` to fold the captured map into the response
-    in whatever shape the endpoint returns (in-place vs rebuilt)."""
-
-    def __init__(self, generator: AsyncGenerator[RequestOutput, None]):
-        self._generator = generator
-        self.routed_experts: dict[int, dict] = {}
-
-    async def __aiter__(self):
-        async for request_output in self._generator:
-            for output in request_output.outputs:
-                if output.routed_experts is not None:
-                    self.routed_experts[output.index] = encode_routed_experts(output.routed_experts)
-            yield request_output
-
-
-class _RoutedExpertsCapture(_RoutedExpertsCaptureBase):
-    """Generate-endpoint variant: rebuilds the response with
-    ``PrimeRlGenerateResponseChoice`` because upstream's
-    ``GenerateResponseChoice`` isn't ``extra='allow'``, so an attribute
-    set after construction wouldn't survive serialization."""
-
+class _GenerateRoutedExpertsCapture(RoutedExpertsCapture):
     def post_process(self, response: GenerateResponse) -> PrimeRlGenerateResponse:
-        new_choices = [
+        choices = [
             PrimeRlGenerateResponseChoice(
-                **choice.model_dump(),
+                **choice.model_dump(exclude={"routed_experts"}),
                 routed_experts=self.routed_experts.get(choice.index),
             )
             for choice in response.choices
         ]
         return PrimeRlGenerateResponse(
             request_id=response.request_id,
-            choices=new_choices,
+            choices=choices,
             prompt_logprobs=response.prompt_logprobs,
             kv_transfer_params=response.kv_transfer_params,
         )
@@ -135,7 +98,7 @@ async def _client_set_max_tokens(raw_request: Request | None) -> bool:
 
 
 class PrimeRlServingTokens(ServingTokens):
-    """ServingTokens + DP-rank routing + routed_experts export + max_tokens defaulting."""
+    """ServingTokens + DP-rank routing + compact routed experts + max_tokens defaulting."""
 
     @cached_property
     def _max_tokens_defaults(self) -> tuple[dict, int | None]:
@@ -306,10 +269,10 @@ async def serve_tokens_full_generator(  # type: ignore[override]
         # encoded experts surface in the JSON. Skipping the wrapper when the
         # engine isn't producing routed experts keeps us a no-op subclass on
         # the common path.
-        capture: _RoutedExpertsCapture | None = None
+        capture: _GenerateRoutedExpertsCapture | None = None
         if self.model_config.enable_return_routed_experts:
-            capture = _RoutedExpertsCapture(result_generator)
-            result_generator = capture  # type: ignore[assignment]
+            capture = _GenerateRoutedExpertsCapture(result_generator)
+            result_generator = capture
 
         response = await super().serve_tokens_full_generator(
             request, result_generator, request_id, model_name, request_metadata
diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py
index bc1128ebc7..67ef7bfa1d 100644
--- a/src/prime_rl/orchestrator/orchestrator.py
+++ b/src/prime_rl/orchestrator/orchestrator.py
@@ -926,6 +926,8 @@ async def setup_rollout_inference_pool(
         renderer=config.renderer.name,
         tool_parser=config.renderer.tool_parser,
         reasoning_parser=config.renderer.reasoning_parser,
+        preserve_all_thinking=config.renderer.preserve_all_thinking,
+        preserve_thinking_between_tool_calls=config.renderer.preserve_thinking_between_tool_calls,
     )
     logger.info(f"Initialized {type(renderer).__name__} for {config.model.name}")
     inference_pool = await setup_inference_pool(
@@ -937,6 +939,8 @@ async def setup_rollout_inference_pool(
         tool_parser=config.renderer.tool_parser,
         reasoning_parser=config.renderer.reasoning_parser,
         renderer_pool_size=config.renderer.pool_size,
+        preserve_all_thinking=config.renderer.preserve_all_thinking,
+        preserve_thinking_between_tool_calls=config.renderer.preserve_thinking_between_tool_calls,
     )
     logger.info("Using direct renderer rollout client")
     return renderer, inference_pool
diff --git a/src/prime_rl/orchestrator/trajectories.py b/src/prime_rl/orchestrator/trajectories.py
index 3a45ee9ada..4cd6f5643c 100644
--- a/src/prime_rl/orchestrator/trajectories.py
+++ b/src/prime_rl/orchestrator/trajectories.py
@@ -6,12 +6,15 @@
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+import pybase64
 import torch
 import verifiers as vf
 from PIL import Image
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 from prime_rl.transport import TrainingSample
+from prime_rl.transport.types import RoutedExperts
 from prime_rl.utils.chat_template import (
     common_prefix_len,
     deserialize_tool_calls,
@@ -25,25 +28,53 @@
 # primitives are immutable. pixel_values/image_grid_thw are not mutated after creation.
 
 
+def _decode_routed_experts(payload: dict[str, Any] | None) -> np.ndarray | None:
+    if payload is None:
+        return None
+    shape = [int(dim) for dim in payload["shape"]]
+    decoded = pybase64.b64decode_as_bytearray(payload["data"])
+    expected_size = int(np.prod(shape, dtype=np.int64))
+    assert len(decoded) == expected_size, (len(decoded), expected_size, shape)
+    routed_experts = np.frombuffer(decoded, dtype=np.uint8).reshape(shape)
+    assert routed_experts.ndim == 3
+    return routed_experts
+
+
 def _align_routed_experts(
-    routed_experts: list[list[list[int]]] | None,
+    routed_experts: np.ndarray | None,
     expected_len: int,
-) -> list[list[list[int]]] | None:
+) -> np.ndarray | None:
     """Align routed_experts length with the expected token count.
 
     vLLM's capturer uses `num_tokens - 1` slot mappings because the final
     generated token was never fed as input to a forward pass and has no
     routing decision. Append zero-filled entries for the missing positions
     and truncate any excess entries beyond `expected_len`.
     """
-    if routed_experts is None or not routed_experts:
+    if routed_experts is None:
         return routed_experts
-    deficit = expected_len - len(routed_experts)
+    assert routed_experts.ndim == 3
+    if routed_experts.shape[0] > expected_len:
+        return np.ascontiguousarray(routed_experts[:expected_len])
+    deficit = expected_len - routed_experts.shape[0]
     if deficit <= 0:
         return routed_experts
-    num_layers = len(routed_experts[0])
-    topk = len(routed_experts[0][0])
-    zero_entry = [[0] * topk for _ in range(num_layers)]
-    return routed_experts + [zero_entry for _ in range(deficit)]
+    padding = np.zeros((deficit, routed_experts.shape[1], routed_experts.shape[2]), dtype=routed_experts.dtype)
+    return np.concatenate((routed_experts, padding), axis=0)
+
+
+def _pack_routed_experts(routed_experts: np.ndarray | None) -> RoutedExperts | None:
+    if routed_experts is None:
+        return None
+    routed_experts = np.ascontiguousarray(routed_experts)
+    return RoutedExperts(
+        data=routed_experts.tobytes(),
+        shape=list(routed_experts.shape),
+        dtype=str(routed_experts.dtype),
+    )
+
+
+def _unpack_routed_experts(routed_experts: RoutedExperts) -> np.ndarray:
+    return np.frombuffer(routed_experts.data, dtype=np.dtype(routed_experts.dtype)).reshape(routed_experts.shape).copy()
 
 
 def _common_prefix_len(a: list[int], b: list[int]) -> int:
@@ -296,13 +327,14 @@ def interleave_rollout(
     def prepare_step_tokens(step: vf.TrajectoryStep, step_idx: int) -> dict[str, Any] | None:
         tokens = step["tokens"]
         if tokens is not None:
+            routed_experts = _decode_routed_experts(tokens.get("routed_experts"))
             return {
                 "prompt_ids": list(tokens["prompt_ids"]),
                 "prompt_mask": [bool(i) for i in tokens["prompt_mask"]],
                 "completion_ids": list(tokens["completion_ids"]),
                 "completion_mask": [bool(i) for i in tokens["completion_mask"]],
                 "completion_logprobs": list(tokens["completion_logprobs"]),
-                "routed_experts": tokens.get("routed_experts"),
+                "routed_experts": routed_experts,
             }
 
         logger.warning(f"Missing rollout tokens for example {output['example_id']} step {step_idx}.")
@@ -328,7 +360,7 @@ def make_sample(tokens: dict[str, Any]) -> TrainingSample:
             len(tokens["prompt_ids"]) + len(tokens["completion_ids"]),
         )
         prompt_ids = list(tokens["prompt_ids"])
-        return TrainingSample(
+        sample = TrainingSample(
             prompt_ids=prompt_ids,
             prompt_mask=[bool(i) for i in tokens["prompt_mask"]],
             completion_ids=completion_ids,
@@ -337,9 +369,10 @@ def make_sample(tokens: dict[str, Any]) -> TrainingSample:
             completion_temperatures=[temperature] * len(completion_ids),
             teacher_logprobs=None,
             advantage=None,
-            routed_experts=routed_experts,
+            routed_experts=_pack_routed_experts(routed_experts),
             mm_token_type_ids=None,
         )
+        return sample
 
     def extend_sample(sample: TrainingSample, prefix_len: int, step_idx: int) -> None:
         """Extend an existing sample with a new trajectory step (extension property holds)."""
@@ -364,15 +397,17 @@ def extend_sample(sample: TrainingSample, prefix_len: int, step_idx: int) -> Non
 
         if tokens.get("routed_experts") is not None and sample.routed_experts is not None:
             step_routed = tokens["routed_experts"]
+            sample_routed_experts = _unpack_routed_experts(sample.routed_experts)
             # The previous step's last routing entry was zero-padded by _align_routed_experts
             # (vLLM only captures num_tokens-1 routings per request). This step actually
             # processed that boundary token as part of its prompt, so replace the zero-fill
             # with the real routing decision before appending new entries.
-            if prefix_len > 0 and prefix_len <= len(step_routed):
-                sample.routed_experts[prefix_len - 1] = step_routed[prefix_len - 1]
-            sample.routed_experts.extend(step_routed[prefix_len:])
+            if prefix_len > 0 and prefix_len <= step_routed.shape[0]:
+                sample_routed_experts[prefix_len - 1] = step_routed[prefix_len - 1]
+            sample_routed_experts = np.concatenate((sample_routed_experts, step_routed[prefix_len:]), axis=0)
             expected_len = len(sample.prompt_ids) + len(sample.completion_ids)
-            sample.routed_experts = _align_routed_experts(sample.routed_experts, expected_len)
+            sample_routed_experts = _align_routed_experts(sample_routed_experts, expected_len)
+            sample.routed_experts = _pack_routed_experts(sample_routed_experts)
 
     # Track [prefix_tokens, sample, last_step_idx] per active sample
     active_samples: list[tuple[list[int], TrainingSample, int]] = []
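A standalone sketch of the alignment rule implemented by _align_routed_experts above: vLLM reports num_tokens - 1 routing rows, so the tail is zero-padded, and overlong inputs are truncated (toy shapes, same semantics as the diff):

import numpy as np

def align(rows: np.ndarray, expected_len: int) -> np.ndarray:
    # Truncate excess rows, or pad missing rows with zeros.
    if rows.shape[0] >= expected_len:
        return rows[:expected_len]
    pad = np.zeros((expected_len - rows.shape[0], *rows.shape[1:]), dtype=rows.dtype)
    return np.concatenate((rows, pad), axis=0)

rows = np.arange(12, dtype=np.uint8).reshape(3, 2, 2)  # [tokens - 1, layers, topk]
assert align(rows, 4).shape == (4, 2, 2)
assert np.all(align(rows, 4)[3] == 0)  # final token has no routing decision
assert align(rows, 2).shape == (2, 2, 2)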
diff --git a/src/prime_rl/trainer/batch.py b/src/prime_rl/trainer/batch.py
index 662df36a80..ca248a43d4 100644
--- a/src/prime_rl/trainer/batch.py
+++ b/src/prime_rl/trainer/batch.py
@@ -1,6 +1,52 @@
 import copy
 
-from prime_rl.transport.types import MicroBatch, TrainingSample
+from prime_rl.transport.types import MicroBatch, RoutedExperts, TrainingSample
+
+ROUTED_EXPERTS_DTYPE_ITEMSIZE = {
+    "uint8": 1,
+    "int16": 2,
+    "int32": 4,
+}
+
+
+def _copy_routed_experts(routed_experts: RoutedExperts) -> RoutedExperts:
+    return RoutedExperts(
+        data=routed_experts.data,
+        shape=list(routed_experts.shape),
+        dtype=routed_experts.dtype,
+    )
+
+
+def _routed_experts_row_size(routed_experts: RoutedExperts) -> int:
+    return routed_experts.shape[1] * routed_experts.shape[2] * ROUTED_EXPERTS_DTYPE_ITEMSIZE[routed_experts.dtype]
+
+
+def _slice_routed_experts(routed_experts: RoutedExperts, seq_len: int) -> RoutedExperts:
+    row_size = _routed_experts_row_size(routed_experts)
+    return RoutedExperts(
+        data=routed_experts.data[: seq_len * row_size],
+        shape=[seq_len, routed_experts.shape[1], routed_experts.shape[2]],
+        dtype=routed_experts.dtype,
+    )
+
+
+def _append_routed_experts(dst: MicroBatch, src: MicroBatch) -> None:
+    dst_routed = dst.routed_experts
+    src_routed = src.routed_experts
+    assert dst_routed is not None
+    assert src_routed is not None
+    assert dst_routed.dtype == src_routed.dtype
+    assert dst_routed.shape[1:] == src_routed.shape[1:]
+    dst_routed.data += src_routed.data
+    dst_routed.shape[0] += src_routed.shape[0]
+
+
+def _pad_routed_experts(micro_batch: MicroBatch, padding_size: int) -> None:
+    routed_experts = micro_batch.routed_experts
+    assert routed_experts is not None
+    row_size = _routed_experts_row_size(routed_experts)
+    routed_experts.data += b"\0" * (padding_size * row_size)
+    routed_experts.shape[0] += padding_size
 
 
 def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch:
@@ -23,7 +69,9 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
     # Teacher logprobs already cover the full sequence (prompt + completion),
     # computed via prefill in the orchestrator when a teacher model is configured
     teacher_logprobs = training_example.teacher_logprobs
-    routed_experts = training_example.routed_experts
+    routed_experts = (
+        _copy_routed_experts(training_example.routed_experts) if training_example.routed_experts is not None else None
+    )
 
     if len(input_ids) > seq_len:
         input_ids = input_ids[:seq_len]
@@ -35,7 +83,7 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
         if teacher_logprobs is not None:
             teacher_logprobs = teacher_logprobs[:seq_len]
         if routed_experts is not None:
-            routed_experts = routed_experts[:seq_len]
+            routed_experts = _slice_routed_experts(routed_experts, seq_len)
         if mm_token_type_ids is not None:
             mm_token_type_ids = mm_token_type_ids[:seq_len]
 
@@ -53,9 +101,10 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
         assert len(teacher_logprobs) == len(input_ids), f"teacher_logprobs: {len(teacher_logprobs)}"
 
     if routed_experts is not None:
-        assert len(routed_experts) == len(input_ids), (
-            f"routed_experts: {len(routed_experts)}, input_ids: {len(input_ids)}"
+        assert routed_experts.shape[0] == len(input_ids), (
+            f"routed_experts: {routed_experts.shape}, input_ids: {len(input_ids)}"
         )
+        assert len(routed_experts.data) == len(input_ids) * _routed_experts_row_size(routed_experts)
 
     if mm_token_type_ids is not None:
         assert len(mm_token_type_ids) == len(input_ids), (
@@ -129,10 +178,9 @@ def packed_samples_into_micro_bs(
             if bin_content.teacher_logprobs is None:
                 bin_content.teacher_logprobs = []
             bin_content.teacher_logprobs.extend(sample.teacher_logprobs)
+        assert (bin_content.routed_experts is None) == (sample.routed_experts is None)
         if sample.routed_experts is not None:
-            if bin_content.routed_experts is None:
-                bin_content.routed_experts = []
-            bin_content.routed_experts.extend(sample.routed_experts)
+            _append_routed_experts(bin_content, sample)
         if sample.mm_token_type_ids is not None:
             if bin_content.mm_token_type_ids is None:
                 bin_content.mm_token_type_ids = []
@@ -178,6 +226,8 @@ def pad_micro_batch(micro_batch: MicroBatch, pad_to_multiple_of: int) -> MicroBa
     )
     if micro_batch.mm_token_type_ids is not None:
         micro_batch.mm_token_type_ids.extend([0] * padding_size)
+    if micro_batch.routed_experts is not None:
+        _pad_routed_experts(micro_batch, padding_size)
 
     return micro_batch
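The byte arithmetic used by the batch helpers above can be checked directly with numpy: one token row is layers * topk * itemsize bytes, so slicing and padding operate on the flat buffer without ever rebuilding nested lists (toy shapes, mirroring _slice_routed_experts and _pad_routed_experts):

import numpy as np

arr = np.arange(24, dtype=np.uint8).reshape(4, 3, 2)  # [seq_len=4, layers=3, topk=2]
data = arr.tobytes()
row_size = 3 * 2 * arr.itemsize  # layers * topk * itemsize
sliced = np.frombuffer(data[: 2 * row_size], dtype=np.uint8).reshape(2, 3, 2)
np.testing.assert_array_equal(sliced, arr[:2])  # byte slice == row slice
padded = data + b"\0" * row_size  # one zero-filled row of padding
assert len(padded) == 5 * row_size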
diff --git a/src/prime_rl/trainer/rl/data.py b/src/prime_rl/trainer/rl/data.py
index ffc4bc627f..cabd126f59 100644
--- a/src/prime_rl/trainer/rl/data.py
+++ b/src/prime_rl/trainer/rl/data.py
@@ -12,6 +12,12 @@
 from prime_rl.trainer.world import get_world
 from prime_rl.transport import MicroBatch, MicroBatchReceiver, TransportConfig, setup_micro_batch_receiver
 
+ROUTED_EXPERTS_TORCH_DTYPES = {
+    "uint8": torch.uint8,
+    "int16": torch.int16,
+    "int32": torch.int32,
+}
+
 
 class TensorMicroBatch(TypedDict):
     """A micro batch of data for training."""
@@ -195,6 +201,18 @@ def _micro_batch_to_tensor(self, micro_batch: MicroBatch) -> TensorMicroBatch:
         if micro_batch.lora_num_tokens is None:
             micro_batch.lora_num_tokens = [0] * self.multi_run_manager.max_runs
             micro_batch.lora_num_tokens[0] = len(micro_batch.input_ids)
+        routed_experts = None
+        packed_routed_experts = micro_batch.routed_experts
+        if packed_routed_experts is not None:
+            routed_experts = (
+                torch.frombuffer(
+                    packed_routed_experts.data,
+                    dtype=ROUTED_EXPERTS_TORCH_DTYPES[packed_routed_experts.dtype],
+                )
+                .reshape(packed_routed_experts.shape)
+                .to(torch.int32)
+                .unsqueeze(0)
+            )
         return TensorMicroBatch(
             input_ids=torch.tensor(micro_batch.input_ids, dtype=torch.long).unsqueeze(0),
             position_ids=torch.tensor(micro_batch.position_ids, dtype=torch.long).unsqueeze(0),
@@ -218,10 +236,6 @@ def _micro_batch_to_tensor(self, micro_batch: MicroBatch) -> TensorMicroBatch:
             mm_token_type_ids=torch.tensor(micro_batch.mm_token_type_ids, dtype=torch.long).unsqueeze(0)
             if micro_batch.mm_token_type_ids is not None
             else None,
-            routed_experts=torch.tensor(micro_batch.routed_experts, dtype=torch.int32).unsqueeze(
-                0
-            )  # [1, seq_len, layers, topk]
-            if micro_batch.routed_experts is not None
-            else None,
+            routed_experts=routed_experts,
             sft_loss=micro_batch.sft_loss,
         )
diff --git a/src/prime_rl/transport/types.py b/src/prime_rl/transport/types.py
index 4bc594f06d..cc943e9b76 100644
--- a/src/prime_rl/transport/types.py
+++ b/src/prime_rl/transport/types.py
@@ -1,6 +1,14 @@
 import msgspec
 
 
+# Routed experts are large per-token arrays. tolist() is too expensive, so we
+# send raw bytes through msgpack and carry the shape/dtype needed to rebuild.
+class RoutedExperts(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
+    data: bytes
+    shape: list[int]  # [seq_len, layers, topk]
+    dtype: str
+
+
 # Orchestrator -> Packer
 class TrainingSample(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
     """A single training example."""
@@ -21,7 +29,7 @@ class TrainingSample(msgspec.Struct, array_like=True, gc=False, omit_defaults=Tr
     # image_grid_thw: grid dimensions [num_images, 3] where each entry is [temporal, height, width]
     image_grid_thw: list[list[int]] | None = None
 
-    routed_experts: list[list[list[int]]] | None = None  # [seq_len, layers, topk]
+    routed_experts: RoutedExperts | None = None
 
     # mm_token_type_ids: token type ids per token [batch seq], int64 (0=text, 1=image, 2=video)
     mm_token_type_ids: list[int] | None = None
@@ -49,7 +57,7 @@ class MicroBatch(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
     temperatures: list[float]  # Per-token temperatures used during generation
     teacher_logprobs: list[float] | None = None
     lora_num_tokens: list[int] | None = None
-    routed_experts: list[list[list[int]]] | None = None
+    routed_experts: RoutedExperts | None = None
 
     # Multimodal fields (Qwen3-VL) — pixel_values stored as raw float32 bytes for efficient serialization
     pixel_values: bytes | None = None
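A round-trip sketch for the RoutedExperts wire format defined above, showing that raw bytes plus shape/dtype survive msgpack without per-element encoding (struct definition copied from this diff; toy array):

import msgspec
import numpy as np

class RoutedExperts(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
    data: bytes
    shape: list[int]
    dtype: str

arr = np.arange(8, dtype=np.uint8).reshape(2, 2, 2)
packed = RoutedExperts(data=arr.tobytes(), shape=list(arr.shape), dtype=str(arr.dtype))
wire = msgspec.msgpack.encode(packed)
unpacked = msgspec.msgpack.decode(wire, type=RoutedExperts)
restored = np.frombuffer(unpacked.data, dtype=np.dtype(unpacked.dtype)).reshape(unpacked.shape)
np.testing.assert_array_equal(restored, arr)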
diff --git a/src/prime_rl/utils/client.py b/src/prime_rl/utils/client.py
index 21659dfc46..fedbdddb8e 100644
--- a/src/prime_rl/utils/client.py
+++ b/src/prime_rl/utils/client.py
@@ -68,6 +68,8 @@ def __init__(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ):
         renderer_model_name = model_name if train_client_type == "renderer" else None
         self._train_clients = setup_clients(
@@ -78,6 +80,8 @@ def __init__(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
         )
         self._eval_clients = setup_clients(client_config, client_type=eval_client_type)
         self._admin_clients = setup_admin_clients(client_config)
@@ -129,6 +133,8 @@ async def setup_inference_pool(
     tool_parser: str | None = None,
     reasoning_parser: str | None = None,
     renderer_pool_size: int | None = None,
+    preserve_all_thinking: bool = False,
+    preserve_thinking_between_tool_calls: bool = False,
 ) -> InferencePool:
     """Create an inference pool from config (static or elastic)."""
     logger = get_logger()
@@ -152,6 +158,8 @@ async def setup_inference_pool(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
         )
 
         logger.info(
@@ -168,6 +176,8 @@ async def setup_inference_pool(
         tool_parser=tool_parser,
         reasoning_parser=reasoning_parser,
         renderer_pool_size=renderer_pool_size,
+        preserve_all_thinking=preserve_all_thinking,
+        preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
     )
 
@@ -179,6 +189,8 @@ def setup_clients(
     tool_parser: str | None = None,
     reasoning_parser: str | None = None,
     renderer_pool_size: int | None = None,
+    preserve_all_thinking: bool = False,
+    preserve_thinking_between_tool_calls: bool = False,
 ) -> list[vf.ClientConfig]:
     clients = []
     client_idx = 0
@@ -196,6 +208,8 @@ def setup_clients(
                 renderer_pool_size=renderer_pool_size,
                 tool_parser=tool_parser,
                 reasoning_parser=reasoning_parser,
+                preserve_all_thinking=preserve_all_thinking,
+                preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
                 api_base_url=base_url,
                 api_key_var=client_config.api_key_var,
                 timeout=client_config.timeout,
diff --git a/src/prime_rl/utils/elastic.py b/src/prime_rl/utils/elastic.py
index 902f873903..c59f81e27f 100644
--- a/src/prime_rl/utils/elastic.py
+++ b/src/prime_rl/utils/elastic.py
@@ -110,6 +110,8 @@ def __init__(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ):
         self.logger = get_logger()
         self.client_config = client_config
@@ -125,6 +127,8 @@ def __init__(
         self.tool_parser = tool_parser
         self.reasoning_parser = reasoning_parser
         self.renderer_pool_size = renderer_pool_size
+        self.preserve_all_thinking = preserve_all_thinking
+        self.preserve_thinking_between_tool_calls = preserve_thinking_between_tool_calls
         self.router_url = client_config.router_url
 
         self._servers: dict[str, ServerState] = {}
@@ -152,6 +156,8 @@ async def from_config(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ) -> ElasticInferencePool:
         if client_config.elastic is None:
             raise ValueError("Elastic inference pool requires elastic config")
@@ -164,6 +170,8 @@ async def from_config(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
        )
         await pool.start()
         return pool
@@ -214,6 +222,8 @@ def _rebuild_clients(self) -> None:
                 tool_parser=self.tool_parser,
                 reasoning_parser=self.reasoning_parser,
                 renderer_pool_size=self.renderer_pool_size,
+                preserve_all_thinking=self.preserve_all_thinking,
+                preserve_thinking_between_tool_calls=self.preserve_thinking_between_tool_calls,
             )
             if urls
             else []
diff --git a/tests/unit/inference/test_serving_tokens.py b/tests/unit/inference/test_serving_tokens.py
index ac5b52b3d4..1882e57e55 100644
--- a/tests/unit/inference/test_serving_tokens.py
+++ b/tests/unit/inference/test_serving_tokens.py
@@ -3,8 +3,7 @@
 The full happy-path is owned upstream by vLLM 0.20's
 ``vllm/entrypoints/serve/disagg`` test suite. We only cover the prime-RL deltas here:
 
-* ``encode_routed_experts`` round-trips a numpy array as expected.
-* ``PrimeRlGenerateResponseChoice`` accepts the optional field.
+* ``serialize_routed_experts`` round-trips a compact raw-byte payload.
 * The subclass attaches its overrides without monkey-patching the parent.
 * ``_client_set_max_tokens`` distinguishes raw-body shapes correctly.
 """
@@ -12,19 +11,26 @@
 from __future__ import annotations
 
 import asyncio
-import base64
 
 import numpy as np
+import pybase64
+from vllm.entrypoints.serve.disagg.protocol import GenerateResponse, GenerateResponseChoice
 
+from prime_rl.inference.vllm.routed_experts import serialize_routed_experts
 from prime_rl.inference.vllm.serving_tokens import (
-    PrimeRlGenerateResponse,
-    PrimeRlGenerateResponseChoice,
     PrimeRlServingTokens,
     _client_set_max_tokens,
-    encode_routed_experts,
+    _GenerateRoutedExpertsCapture,
 )
 
 
+def _decode_routed_experts(encoded: dict) -> np.ndarray:
+    return np.frombuffer(
+        pybase64.b64decode_as_bytearray(encoded["data"]),
+        dtype=np.uint8,
+    ).reshape(encoded["shape"])
+
+
 class _FakeRawRequest:
     def __init__(self, body):
         self._body = body
@@ -36,50 +42,54 @@ class _FakeRawRequest:
         return self._body
 
 
-def test_encode_routed_experts_roundtrip():
-    arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
-    encoded = encode_routed_experts(arr)
-
-    assert encoded["shape"] == [2, 3]
-    decoded = np.frombuffer(base64.b85decode(encoded["data"]), dtype=np.int32).reshape(encoded["shape"])
-    np.testing.assert_array_equal(decoded, arr)
-
+async def _empty_request_outputs():
+    if False:
+        yield
 
-def test_routed_experts_choice_accepts_none_and_dict():
-    no_re = PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[1, 2])
-    assert no_re.routed_experts is None
 
-    encoded = encode_routed_experts(np.zeros((1, 1), dtype=np.int32))
-    with_re = PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[1], routed_experts=encoded)
-    assert with_re.routed_experts == encoded
-
-
-def test_response_only_serializes_declared_fields():
-    # Upstream silently drops id=/created=/model=/usage= because they're not
-    # declared on GenerateResponse. Our subclass adds nothing to that surface
-    # — it only widens the choices type — so the JSON shape stays slim.
-    resp = PrimeRlGenerateResponse(
-        request_id="gen-x",
-        choices=[PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[7])],
-    )
-    dumped = resp.model_dump()
-    assert set(dumped.keys()) == {
-        "request_id",
-        "choices",
-        "prompt_logprobs",
-        "kv_transfer_params",
-    }
-    assert dumped["choices"][0]["routed_experts"] is None
-
-
-def test_subclass_inherits_serve_tokens_full_generator():
-    # The subclass adds an override; make sure we didn't accidentally rebind
-    # ``serve_tokens`` to a parent attribute via __dict__-update tricks later.
+def test_subclass_only_overrides_serve_tokens():
+    assert PrimeRlServingTokens.serve_tokens is not PrimeRlServingTokens.__mro__[1].serve_tokens
     assert (
         PrimeRlServingTokens.serve_tokens_full_generator
         is not PrimeRlServingTokens.__mro__[1].serve_tokens_full_generator
     )
-    assert PrimeRlServingTokens.serve_tokens is not PrimeRlServingTokens.__mro__[1].serve_tokens
+
+
+def test_serialize_routed_experts_uses_compact_raw_payload():
+    routed_experts = np.array(
+        [
+            [[1, 2], [3, 4]],
+            [[5, 6], [7, 8]],
+        ],
+        dtype=np.int64,
+    )
+
+    encoded = serialize_routed_experts(routed_experts)
+    assert encoded is not None
+
+    decoded = _decode_routed_experts(encoded)
+    assert decoded.dtype == np.uint8
+    np.testing.assert_array_equal(decoded, routed_experts)
+
+
+def test_generate_response_post_process_replaces_upstream_routed_experts():
+    compact_routed_experts = {"data": "AQID", "shape": [1, 1, 3]}
+    capture = _GenerateRoutedExpertsCapture(_empty_request_outputs())
+    capture.routed_experts[0] = compact_routed_experts
+    response = GenerateResponse(
+        request_id="request-id",
+        choices=[
+            GenerateResponseChoice(
+                index=0,
+                token_ids=[1, 2, 3],
+                routed_experts="upstream-npy-payload",
+            )
+        ],
+    )
+
+    processed = capture.post_process(response)
+
+    assert processed.choices[0].routed_experts == compact_routed_experts
 
 
 def test_client_set_max_tokens_recognizes_explicit_value():
diff --git a/tests/unit/orchestrator/test_batch.py b/tests/unit/orchestrator/test_batch.py
index a2e2e50079..fc95de4e2f 100644
--- a/tests/unit/orchestrator/test_batch.py
+++ b/tests/unit/orchestrator/test_batch.py
@@ -1,7 +1,17 @@
+import numpy as np
 import pytest
 
 from prime_rl.trainer.batch import prepare_batch, prepare_sample
-from prime_rl.transport.types import TrainingSample
+from prime_rl.transport.types import RoutedExperts, TrainingSample
+
+
+def _routed_experts(data, dtype=np.uint8):
+    routed_experts = np.asarray(data, dtype=dtype)
+    return RoutedExperts(
+        data=routed_experts.tobytes(),
+        shape=list(routed_experts.shape),
+        dtype=str(routed_experts.dtype),
+    )
 
 
 @pytest.fixture
@@ -109,6 +119,7 @@ def test_prepare_sample_with_routed_experts():
     """Routed experts are passed through prepare_sample and match input_ids length."""
     # 2 prompt + 2 completion = 4 tokens, 2 layers, topk=2
     routed_experts = [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]], [[1, 0], [3, 2]]]
+    routed = _routed_experts(routed_experts)
     sample = TrainingSample(
         prompt_ids=[1, 2],
         prompt_mask=[False, False],
@@ -117,18 +128,21 @@ def test_prepare_sample_with_routed_experts():
         completion_logprobs=[-0.1, -0.2],
         completion_temperatures=[1.0, 1.0],
         advantage=1.0,
-        routed_experts=routed_experts,
+        routed_experts=routed,
     )
 
     micro_batch = prepare_sample(sample, seq_len=8)
 
     assert micro_batch.routed_experts is not None
-    assert len(micro_batch.routed_experts) == 4
-    assert micro_batch.routed_experts == routed_experts
+    assert micro_batch.routed_experts.data == routed.data
+    assert micro_batch.routed_experts.shape == routed.shape
+    assert micro_batch.routed_experts.dtype == routed.dtype
 
 
 def test_prepare_sample_truncates_routed_experts():
     """Routed experts are truncated to seq_len when input exceeds it."""
     routed_experts = [[[0, 1]], [[2, 3]], [[4, 5]], [[6, 7]]]
+    routed = _routed_experts(routed_experts)
+    expected = _routed_experts(routed_experts[:3])
     sample = TrainingSample(
         prompt_ids=[1, 2],
         prompt_mask=[False, False],
@@ -137,13 +151,14 @@ def test_prepare_sample_truncates_routed_experts():
         completion_logprobs=[-0.1, -0.2],
         completion_temperatures=[1.0, 1.0],
         advantage=1.0,
-        routed_experts=routed_experts,
+        routed_experts=routed,
     )
 
     micro_batch = prepare_sample(sample, seq_len=3)
 
     assert micro_batch.routed_experts is not None
-    assert len(micro_batch.routed_experts) == 3
-    assert micro_batch.routed_experts == routed_experts[:3]
+    assert micro_batch.routed_experts.data == expected.data
+    assert micro_batch.routed_experts.shape == expected.shape
+    assert micro_batch.routed_experts.dtype == expected.dtype
 
 
 def test_prepare_sample_none_routed_experts():
diff --git a/tests/unit/orchestrator/test_orchestrator_setup.py b/tests/unit/orchestrator/test_orchestrator_setup.py
index ff9bb5b79f..5c5b420fc5 100644
--- a/tests/unit/orchestrator/test_orchestrator_setup.py
+++ b/tests/unit/orchestrator/test_orchestrator_setup.py
@@ -50,6 +50,8 @@ async def run() -> None:
                 tool_parser=None,
                 reasoning_parser=None,
                 pool_size=None,
+                preserve_all_thinking=True,
+                preserve_thinking_between_tool_calls=False,
             ),
         )
         rollout_client_config = SimpleNamespace(base_url=["http://localhost:8000/v1"])
@@ -79,6 +81,8 @@ async def run() -> None:
             renderer="qwen3_vl",
             tool_parser=None,
             reasoning_parser=None,
+            preserve_all_thinking=True,
+            preserve_thinking_between_tool_calls=False,
         )
         setup_pool_mock.assert_awaited_once_with(
             rollout_client_config,
@@ -89,6 +93,8 @@ async def run() -> None:
             tool_parser=None,
             reasoning_parser=None,
             renderer_pool_size=None,
+            preserve_all_thinking=True,
+            preserve_thinking_between_tool_calls=False,
         )
 
     asyncio.run(run())
diff --git a/tests/unit/orchestrator/test_trajectories.py b/tests/unit/orchestrator/test_trajectories.py
index 6fa169760c..303a02fd11 100644
--- a/tests/unit/orchestrator/test_trajectories.py
+++ b/tests/unit/orchestrator/test_trajectories.py
@@ -3,6 +3,7 @@
 from unittest.mock import MagicMock
 
 import numpy as np
+import pybase64
 import pytest
 import verifiers as vf
 from PIL import Image
@@ -30,6 +31,21 @@
     return np.frombuffer(pixel_bytes, dtype=np.float32).reshape(shape).tolist()
 
 
+def _routed_experts_payload(data) -> dict:
+    arr = np.asarray(data, dtype=np.uint8)
+    return {
+        "data": pybase64.b64encode(memoryview(np.ascontiguousarray(arr))).decode("ascii"),
+        "shape": list(arr.shape),
+    }
+
+
+def _sample_routed_experts(sample) -> np.ndarray:
+    assert sample.routed_experts is not None
+    return np.frombuffer(sample.routed_experts.data, dtype=np.dtype(sample.routed_experts.dtype)).reshape(
+        sample.routed_experts.shape
+    )
+
+
 def test_deserialize_tool_calls_does_not_inject_missing_key():
     messages = [{"role": "assistant", "content": "hello"}]
 
@@ -1857,40 +1873,43 @@ def test_align_routed_experts_none():
 
 
 def test_align_routed_experts_empty():
-    result = _align_routed_experts([], 10)
-    assert result == []
+    experts = np.empty((0, 2, 2), dtype=np.uint8)
+    result = _align_routed_experts(experts, 10)
+    assert result is not None
+    assert result.shape == (10, 2, 2)
+    assert np.all(result == 0)
 
 
 def test_align_routed_experts_no_deficit():
     # 3 tokens, 2 layers, topk=2
-    experts = [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]]]
+    experts = np.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=3)
-    assert result == experts
+    np.testing.assert_array_equal(result, experts)
 
 
 def test_align_routed_experts_with_deficit():
     # 2 tokens but expected 4 (deficit of 2)
-    experts = [[[1, 2], [3, 4]], [[5, 6], [7, 0]]]
+    experts = np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 0]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=4)
-    assert len(result) == 4
-    assert result[:2] == experts
+    assert result is not None
+    assert result.shape == (4, 2, 2)
+    np.testing.assert_array_equal(result[:2], experts)
     # Padded entries should be zero-filled with same shape [layers=2, topk=2]
-    assert result[2] == [[0, 0], [0, 0]]
-    assert result[3] == [[0, 0], [0, 0]]
+    np.testing.assert_array_equal(result[2], [[0, 0], [0, 0]])
+    np.testing.assert_array_equal(result[3], [[0, 0], [0, 0]])
 
 
 def test_align_routed_experts_excess_length():
-    experts = [[[1, 2]], [[3, 4]], [[5, 6]]]
+    experts = np.asarray([[[1, 2]], [[3, 4]], [[5, 6]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=2)
-    # No truncation, just returns as-is
-    assert result == experts
+    np.testing.assert_array_equal(result, experts[:2])
 
 
 def test_interleave_rollout_single_step_with_routed_experts():
     """Routed experts are aligned and passed through for a single-step trajectory."""
     # prompt_ids=[1,2], completion_ids=[3,4] -> total 4 tokens
     # vLLM returns num_tokens-1 = 3 routed expert entries
-    routed_experts_from_vllm = [[[0, 1]], [[2, 3]], [[4, 5]]]  # 3 entries, 1 layer, topk=2
+    routed_experts_from_vllm = np.asarray([[[0, 1]], [[2, 3]], [[4, 5]]], dtype=np.uint8)
     output = vf.RolloutOutput(
         example_id=0,
         trajectory=[
@@ -1906,7 +1925,7 @@ def test_interleave_rollout_single_step_with_routed_experts():
                     completion_logprobs=[-0.1, -0.2],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=routed_experts_from_vllm,
+                    routed_experts=_routed_experts_payload(routed_experts_from_vllm),
                 ),
                 reward=None,
                 advantage=None,
@@ -1926,18 +1945,19 @@ def test_interleave_rollout_single_step_with_routed_experts():
 
     # Should be aligned to 4 tokens (2 prompt + 2 completion)
     assert sample.routed_experts is not None
-    assert len(sample.routed_experts) == 4
+    routed_experts = _sample_routed_experts(sample)
+    assert routed_experts.shape == (4, 1, 2)
     # First 3 are original, last one is zero-padded
-    assert sample.routed_experts[:3] == routed_experts_from_vllm
-    assert sample.routed_experts[3] == [[0, 0]]
+    np.testing.assert_array_equal(routed_experts[:3], routed_experts_from_vllm)
+    np.testing.assert_array_equal(routed_experts[3], [[0, 0]])
 
 
 def test_interleave_rollout_multi_step_with_routed_experts():
     """Routed experts are extended and aligned across multi-step trajectories."""
     # Step 1: prompt=[1,2], completion=[3,4] -> 4 tokens, vLLM returns 3
-    step1_experts = [[[1, 2]], [[3, 4]], [[5, 6]]]
+    step1_experts = np.asarray([[[1, 2]], [[3, 4]], [[5, 6]]], dtype=np.uint8)
     # Step 2: prompt=[1,2,3,4,5,6], completion=[7,8] -> 8 tokens, vLLM returns 7
-    step2_experts = [[[1, 0]], [[2, 0]], [[3, 0]], [[4, 0]], [[5, 0]], [[6, 0]], [[7, 0]]]
+    step2_experts = np.asarray([[[1, 0]], [[2, 0]], [[3, 0]], [[4, 0]], [[5, 0]], [[6, 0]], [[7, 0]]], dtype=np.uint8)
 
     output = vf.RolloutOutput(
         example_id=0,
@@ -1954,7 +1974,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
                     completion_logprobs=[-0.1, -0.2],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=step1_experts,
+                    routed_experts=_routed_experts_payload(step1_experts),
                 ),
                 reward=None,
                 advantage=None,
@@ -1978,7 +1998,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
                     completion_logprobs=[-0.3, -0.4],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=step2_experts,
+                    routed_experts=_routed_experts_payload(step2_experts),
                 ),
                 reward=None,
                 advantage=None,
@@ -1999,7 +2019,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
     # Merged sample: prompt=[1,2], completion=[3,4,5,6,7,8] -> 8 tokens total
     assert len(sample.prompt_ids) + len(sample.completion_ids) == 8
     assert sample.routed_experts is not None
-    assert len(sample.routed_experts) == 8
+    assert _sample_routed_experts(sample).shape == (8, 1, 2)
 
 
 def test_interleave_rollout_none_routed_experts_stays_none():
diff --git a/tests/unit/utils/test_client.py b/tests/unit/utils/test_client.py
index 6b48790ef3..3b13e30bd3 100644
--- a/tests/unit/utils/test_client.py
+++ b/tests/unit/utils/test_client.py
@@ -62,10 +62,13 @@ def test_setup_clients_assigns_renderer_and_dp_rank_headers():
         client_config,
         client_type="renderer",
         renderer_name="qwen3_vl",
+        preserve_all_thinking=True,
     )
 
     assert [client.client_type for client in clients] == ["renderer", "renderer"]
     assert [client.renderer for client in clients] == ["qwen3_vl", "qwen3_vl"]
+    assert [client.preserve_all_thinking for client in clients] == [True, True]
+    assert [client.preserve_thinking_between_tool_calls for client in clients] == [False, False]
    assert [client.renderer_model_name for client in clients] == [None, None]
     assert [client.api_base_url for client in clients] == ["http://worker-a:8000/v1"] * 2
     assert [client.extra_headers["X-data-parallel-rank"] for client in clients] == ["0", "1"]
sys_platform == 'linux')" }, - { name = "requests", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "tiktoken", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "jsonschema", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic-extra-types", extra = ["pycountry"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/61/97/753c85b5c0a19f4331ac99e0300ac8da06d4b29b629c9cb03064b38561bd/mistral_common-1.11.0.tar.gz", hash = "sha256:439b7fa38f9c3f020154af51bdf30eb81def507643017d8ce9f798384ec47ec3", size = 6355512, upload-time = "2026-04-01T13:54:12.36Z" } wheels = [ @@ -1907,7 +1911,34 @@ wheels = [ [package.optional-dependencies] image = [ - { name = "opencv-python-headless", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] + +[[package]] +name = "mistral-common" +version = "1.11.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "jsonschema", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic-extra-types", extra = ["pycountry"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/eb/12167a1bea9714582e5b4f539f9c019323363e314a499c72855ff0e5ad43/mistral_common-1.11.2.tar.gz", hash = "sha256:79f68fc2d1190f28637f40e053f919c8c2697e00b2aa679ddee562a95183f4ad", size = 6357845, upload-time = "2026-05-04T19:47:40.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/f0/6a5d604b972e442b9d36c117d01788feddad099e4965699e3516ee6fefc3/mistral_common-1.11.2-py3-none-any.whl", hash = "sha256:ebb42062cd705a0aa2bc69b4cde2b83d446ae58150b7e29322c90cb08fcfca6c", size 
= 6531968, upload-time = "2026-05-04T19:47:37.718Z" }, +] + +[package.optional-dependencies] +image = [ + { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] [[package]] @@ -1964,20 +1995,44 @@ wheels = [ name = "model-hosting-container-standards" version = "0.1.13" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "fastapi", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "httpx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "jmespath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "starlette", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "supervisor", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "fastapi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "httpx", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "jmespath", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "starlette", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "supervisor", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d7/b7/a6a31b4dfd30d14b1019dc358f09c9d88ca38e555ba7c976e7d3e6b593fe/model_hosting_container_standards-0.1.13.tar.gz", hash = "sha256:27a1333410dde2719286a300a2803e24fdde407baa91894eb845c0f268aa194d", size = 79116, upload-time = "2026-01-09T21:45:20.683Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8c/37/6dc61971ba31450bbed460b5f40543f0915e352680534e3bcaf57116d8d7/model_hosting_container_standards-0.1.13-py3-none-any.whl", hash = "sha256:be307d4a988cc660df4e6bd8bdedb7917844bac940e332f9fd001cb385d7994c", size = 105738, upload-time = "2026-01-09T21:45:18.959Z" }, ] +[[package]] +name = "model-hosting-container-standards" +version = "0.1.15" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "fastapi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "httpx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "jmespath", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = 
"platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "starlette", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "supervisor", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/5a/d669bdeb5ba96db42c6ef010835a25119b05f8c35ee5f1c3f715626625fe/model_hosting_container_standards-0.1.15.tar.gz", hash = "sha256:ae8dd74d3250545c14f0a7068186c7b0f0ab6563d31e7137f556b6b660c8a6a9", size = 93994, upload-time = "2026-05-05T18:22:29.357Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/26/c7aea197f1719f31d0dd686eb4475982fe9efd7668ce259cb52b62c676b6/model_hosting_container_standards-0.1.15-py3-none-any.whl", hash = "sha256:849e08c4732203ee861c8c24966b4e916ea4420fa324b430f7f74a1e1fe8811a", size = 125418, upload-time = "2026-05-05T18:22:27.819Z" }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -2767,6 +2822,7 @@ dependencies = [ { name = "prime", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-rl-configs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyarrow", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pybase64", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyzmq", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "renderers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "rich", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -2782,8 +2838,8 @@ dependencies = [ { name = "transformers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "uvloop", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "verifiers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "vllm", version = "0.20.2rc1.dev354+g24337fb86.cu129", source = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "vllm", version = "0.20.2+cu129", source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "vllm", version = "0.20.2+cu129", source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "wandb", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and 
sys_platform == 'linux')" }, ] @@ -2890,6 +2946,7 @@ requires-dist = [ { name = "prime-rl", extras = ["quack"], marker = "extra == 'all'" }, { name = "prime-rl-configs", editable = "packages/prime-rl-configs" }, { name = "pyarrow", specifier = ">=21.0.0" }, + { name = "pybase64", specifier = ">=1.4.2" }, { name = "pyzmq", specifier = ">=27.1.0" }, { name = "quack-kernels", marker = "extra == 'quack'", specifier = ">=0.3.3" }, { name = "renderers", specifier = "==0.1.6" }, @@ -2907,11 +2964,11 @@ requires-dist = [ { name = "torchvision", index = "https://download.pytorch.org/whl/cu128" }, { name = "transformers", git = "https://github.com/huggingface/transformers.git?rev=c1c3424" }, { name = "uvloop", specifier = ">=0.21.0" }, - { name = "verifiers", git = "https://github.com/PrimeIntellect-ai/verifiers.git?rev=aa428f3" }, - { name = "vllm", marker = "platform_machine != 'aarch64' and platform_machine != 'x86_64'", specifier = ">=0.20.2" }, + { name = "verifiers", git = "https://github.com/PrimeIntellect-ai/verifiers?rev=3708ede" }, + { name = "vllm", marker = "platform_machine != 'aarch64' and platform_machine != 'x86_64'" }, { name = "vllm", marker = "platform_machine == 'aarch64'", url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" }, - { name = "vllm", marker = "platform_machine == 'x86_64'", url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" }, - { name = "vllm-router", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" }, + { name = "vllm", marker = "platform_machine == 'x86_64'", url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" }, + { name = "vllm-router", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }, { name = "wandb", specifier = ">=0.26.1" }, { name = "wiki-search", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, ] @@ -3858,6 +3915,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" }, ] +[[package]] +name = "tokenspeed-mla" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenspeed-triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/01/4bf8b74ead3e8e7c1c809435396254c067a33fde48acc20f602aae622d97/tokenspeed_mla-0.1.2-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:c9466a351fe039792e56cf49f3e79744c1dc28c7af10306a02e62b8e92fa5985", size = 748681, upload-time = "2026-05-13T03:30:56.718Z" 
}, +] + +[[package]] +name = "tokenspeed-triton" +version = "3.7.10.post20260505" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/c3/4808d86016368fed9495c3a3408cc7f912e7863ff3432937404bd0a551a6/tokenspeed_triton-3.7.10.post20260505-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19618c7db01a9bd33885f7acbf8945adb2f5534668aa97629b56d481753cbcad", size = 89127692, upload-time = "2026-05-05T07:49:04.22Z" }, +] + [[package]] name = "toml" version = "0.10.2" @@ -4197,8 +4276,8 @@ wheels = [ [[package]] name = "verifiers" -version = "0.1.14" -source = { git = "https://github.com/PrimeIntellect-ai/verifiers.git?rev=aa428f3#aa428f3941ae35a7cf7c0dad7e60c7eca525bac6" } +version = "0.1.15.dev5" +source = { git = "https://github.com/PrimeIntellect-ai/verifiers?rev=3708ede#3708ede31d16b77866befa3c7a97cf94b5062cd3" } dependencies = [ { name = "aiolimiter", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "anthropic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -4215,6 +4294,7 @@ dependencies = [ { name = "openai-agents", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-sandboxes", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-tunnel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pybase64", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyzmq", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "regex", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -4223,7 +4303,6 @@ dependencies = [ { name = "setproctitle", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "tenacity", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "textual", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "wget", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] [[package]] @@ -4242,83 +4321,84 @@ wheels = [ [[package]] name = "vllm" -version = "0.20.2+cu129" -source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" } +version = "0.20.2rc1.dev354+g24337fb86.cu129" +source = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" } resolution-markers = [ - 
"platform_machine == 'aarch64' and sys_platform == 'linux'", + "platform_machine == 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = "aiohttp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "anthropic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "apache-tvm-ffi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "blake3", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cachetools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cbor2", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cloudpickle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "compressed-tensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "depyf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "diskcache", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "einops", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "fastsafetensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "filelock", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "flashinfer-cubin", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "flashinfer-python", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "gguf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "ijson", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "lark", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "lm-format-enforcer", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "mcp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "mistral-common", extra = ["image"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "model-hosting-container-standards", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "msgspec", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "ninja", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "numba", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "openai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "openai-harmony", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-api", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = 
"opentelemetry-exporter-otlp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-sdk", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "outlines-core", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "partial-json-parser", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "prometheus-client", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "protobuf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "psutil", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "py-cpuinfo", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pybase64", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "python-json-logger", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pyyaml", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pyzmq", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "quack-kernels", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "regex", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "sentencepiece", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "setproctitle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "six", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tilelang", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tokenizers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchaudio", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tqdm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "watchfiles", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "aiohttp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "anthropic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = 
"apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "blake3", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cachetools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cbor2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cloudpickle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "compressed-tensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "depyf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "diskcache", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "einops", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "fastsafetensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "flashinfer-cubin", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "flashinfer-python", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "gguf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "ijson", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "lark", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "llguidance", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "lm-format-enforcer", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "mcp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "mistral-common", version = "1.11.2", source = { registry = "https://pypi.org/simple" }, extra = ["image"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "model-hosting-container-standards", version = "0.1.15", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "msgspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "ninja", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numba", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "openai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "openai-harmony", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-api", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-sdk", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-semantic-conventions-ai", 
marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "outlines-core", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "partial-json-parser", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "prometheus-client", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "protobuf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "psutil", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "py-cpuinfo", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pybase64", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "python-json-logger", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pyyaml", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pyzmq", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "quack-kernels", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "regex", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "sentencepiece", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setproctitle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "six", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tilelang", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenizers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenspeed-mla", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchaudio", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchvision", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tqdm", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "watchfiles", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "xgrammar", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:8a58a086c5c4ed2883eee36aaaf6b79c83463d02da3015454acf92afcc8e150e" }, + { url = 
"https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a16f4fd2d468f0bb0afd84e3e96f4016654e8525892879909f7a095e33101668" }, ] [package.metadata] @@ -4352,14 +4432,14 @@ requires-dist = [ { name = "matplotlib", marker = "extra == 'bench'" }, { name = "mcp" }, { name = "mistral-common", extras = ["audio"], marker = "extra == 'audio'" }, - { name = "mistral-common", extras = ["image"], specifier = ">=1.11.0" }, - { name = "model-hosting-container-standards", specifier = ">=0.1.13,<1.0.0" }, + { name = "mistral-common", extras = ["image"], specifier = ">=1.11.2" }, + { name = "model-hosting-container-standards", specifier = ">=0.1.14,<1.0.0" }, { name = "msgspec" }, { name = "ninja" }, { name = "numba", specifier = "==0.65.0" }, { name = "numpy" }, { name = "nvidia-cudnn-frontend", specifier = ">=1.13.0,<1.19.0" }, - { name = "nvidia-cutlass-dsl", specifier = ">=4.4.2" }, + { name = "nvidia-cutlass-dsl", specifier = "==4.5.0" }, { name = "openai", specifier = ">=2.0.0" }, { name = "openai-harmony", specifier = ">=0.0.3" }, { name = "opencv-python-headless", specifier = ">=4.13.0" }, @@ -4403,6 +4483,7 @@ requires-dist = [ { name = "tiktoken", specifier = ">=0.6.0" }, { name = "tilelang", specifier = "==0.1.9" }, { name = "tokenizers", specifier = ">=0.21.1" }, + { name = "tokenspeed-mla", specifier = "==0.1.2" }, { name = "torch", specifier = "==2.11.0" }, { name = "torchaudio", specifier = "==2.11.0" }, { name = "torchvision", specifier = "==0.26.0" }, @@ -4410,7 +4491,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.56.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*,!=5.4.*,!=5.5.0" }, { name = "typing-extensions", specifier = ">=4.10" }, { name = "watchfiles" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'", specifier = ">=0.1.32,<1.0.0" }, + { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'", specifier = ">=0.2.0,<1.0.0" }, { name = "zentorch-weekly", marker = "extra == 'zen'", specifier = "==5.2.1.dev20260408" }, ] provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttensor", "runai", "audio", "video", "flashinfer", "helion", "grpc", "otel"] @@ -4418,82 +4499,82 @@ provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttens [[package]] name = "vllm" version = "0.20.2+cu129" -source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" } +source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" } resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", ] dependencies = [ - { name = "aiohttp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "anthropic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "blake3", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "cachetools", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, - { name = "cbor2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "cloudpickle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "compressed-tensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "depyf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "diskcache", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "einops", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "fastsafetensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "flashinfer-cubin", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "flashinfer-python", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "gguf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "ijson", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "lark", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "llguidance", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "lm-format-enforcer", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "mcp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "mistral-common", extra = ["image"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "model-hosting-container-standards", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "msgspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "ninja", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "numba", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "openai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "openai-harmony", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-api", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-sdk", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "outlines-core", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "partial-json-parser", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = 
"prometheus-client", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "protobuf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "psutil", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "py-cpuinfo", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pybase64", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "python-json-logger", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pyyaml", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pyzmq", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "quack-kernels", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "regex", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "sentencepiece", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setproctitle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "six", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tilelang", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tokenizers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchaudio", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchvision", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tqdm", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "watchfiles", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "aiohttp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "anthropic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "apache-tvm-ffi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "blake3", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cachetools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cbor2", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cloudpickle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "compressed-tensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "depyf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { 
name = "diskcache", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "einops", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "fastsafetensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "filelock", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "flashinfer-cubin", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "flashinfer-python", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "gguf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "ijson", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "lark", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "llguidance", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "lm-format-enforcer", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "mcp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "mistral-common", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, extra = ["image"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "model-hosting-container-standards", version = "0.1.13", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "msgspec", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "ninja", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numba", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "openai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "openai-harmony", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-api", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-sdk", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "outlines-core", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "partial-json-parser", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "prometheus-client", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "protobuf", marker = 
"platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "psutil", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "py-cpuinfo", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pybase64", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "python-json-logger", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pyyaml", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pyzmq", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "quack-kernels", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "regex", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "sentencepiece", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setproctitle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "six", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tilelang", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tokenizers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tqdm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "watchfiles", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "xgrammar", version = "0.1.33", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:2f8c2bf2ac6d3d16f930535e66822abd71065468521884eb5b910225b2abef4b" }, + { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:8a58a086c5c4ed2883eee36aaaf6b79c83463d02da3015454acf92afcc8e150e" }, ] [package.metadata] @@ -4592,8 +4673,8 @@ provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttens [[package]] name = "vllm-router" -version = "0.1.22" -source = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" } +version = "0.1.25" +source = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" } dependencies = [ { name = "aiohttp", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, { name = "fastapi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -4603,7 +4684,7 @@ dependencies = [ { name = "uvicorn", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6361a0387241e56932f3ba2e51af27f58d11a462e3187e58286b2f96056e4d15" }, + { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:e84e731a0779f820bfe3cf4ce78cea2d09993c0a6501c63bcda93826bcd21fd0" }, ] [package.metadata] @@ -4711,12 +4792,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" }, ] -[[package]] -name = "wget" -version = "3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857, upload-time = "2015-10-22T15:26:37.51Z" } - [[package]] name = "widgetsnbextension" version = "4.0.14" @@ -4744,18 +4819,40 @@ wheels = [ name = "xgrammar" version = "0.1.33" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "numpy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "transformers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/db/43/e5dfddb1d2a4fccf3e3a88f103e88698cdefc3182f4e169a359ffe1c1794/xgrammar-0.1.33.tar.gz", hash = "sha256:8dbe5fc3d76651ab1fac7a68fc2a118b885fa0ec7189927fb6e0dce0081aea99", size = 2398956, upload-time = "2026-03-27T10:16:36.582Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/4e/04/43d4baca876f5ae1b45897ec30a59801a2da37f16da1fcd85f9555e4c125/xgrammar-0.1.33-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c803e60d791854c5d1f271ece7e1f34d73c82dd4a8b2a06b7af5331482a78ac", size = 42133168, upload-time = "2026-03-27T10:15:16.994Z" }, - { url = "https://files.pythonhosted.org/packages/f0/a8/672833a3cff027253793aa999401d8364896ebf396967e475c7a878b895f/xgrammar-0.1.33-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:52b8eaa533282a0efb0835db6998ae72e7b3c7875d7a52e360ffebff9b78c30a", size = 42205803, upload-time = "2026-03-27T10:15:21.599Z" }, +] + +[[package]] +name = "xgrammar" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a0/54/7e593fc41ffcaf5ac7c0379e0aec0cf03e53a742d1a91f64c6c7e79a6ac1/xgrammar-0.2.0.tar.gz", hash = "sha256:c4f0238a89869343171d43d069b8c5da874f3c2c25f408f20cd5987219a6adef", size = 2421093, upload-time = "2026-05-01T18:33:54.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/30/99f4e83821db16d58dd41249ba46038ed47bce274c57ad5567030775fc62/xgrammar-0.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a36c744d24d93e178c138486aa02b390a80326b64ff11e222e063a028dd65849", size = 44616361, upload-time = "2026-05-01T18:32:42.536Z" }, ] [[package]]