diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
index 5d04d3369f..8feba60128 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
@@ -1269,6 +1269,10 @@ def validate_renderer_args(self):
             renderer_args_set.append(f"renderer.reasoning_parser={self.renderer.reasoning_parser!r}")
         if self.renderer.pool_size is not None:
             renderer_args_set.append(f"renderer.pool_size={self.renderer.pool_size!r}")
+        if self.renderer.preserve_all_thinking:
+            renderer_args_set.append("renderer.preserve_all_thinking=true")
+        if self.renderer.preserve_thinking_between_tool_calls:
+            renderer_args_set.append("renderer.preserve_thinking_between_tool_calls=true")
 
         if renderer_args_set:
             raise ValueError(
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/rl.py b/packages/prime-rl-configs/src/prime_rl/configs/rl.py
index a160af2c9f..8d5f087da7 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/rl.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/rl.py
@@ -791,6 +791,19 @@ def auto_setup_router_replay(self):
             )
         return self
 
+    @model_validator(mode="after")
+    def validate_router_replay_without_kv_offload(self):
+        if (
+            self.trainer.enable_router_replay
+            and self.inference is not None
+            and self.inference.kv_cache_offload is not None
+        ):
+            raise ValueError(
+                "Router replay with inference.kv_cache_offload is not supported. "
+                "External KV cache hits do not carry routed-expert decisions."
+            )
+        return self
+
     @model_validator(mode="after")
     def auto_setup_deployment(self):
         if self.deployment.type == "single_node":  # single-node
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/shared.py b/packages/prime-rl-configs/src/prime_rl/configs/shared.py
index d26c33d9a9..1651e17b9f 100644
--- a/packages/prime-rl-configs/src/prime_rl/configs/shared.py
+++ b/packages/prime-rl-configs/src/prime_rl/configs/shared.py
@@ -186,6 +186,26 @@ class RendererConfig(BaseConfig):
         ),
     ] = None
 
+    preserve_all_thinking: Annotated[
+        bool,
+        Field(
+            description=(
+                "Forward preserve_all_thinking to the renderer client. When true, "
+                "past-assistant reasoning_content is re-emitted on subsequent renders."
+            ),
+        ),
+    ] = False
+
+    preserve_thinking_between_tool_calls: Annotated[
+        bool,
+        Field(
+            description=(
+                "Forward preserve_thinking_between_tool_calls to the renderer client. "
+                "This preserves thinking only inside the active assistant/tool block."
+            ),
+        ),
+    ] = False
+
 
 class ElasticConfig(BaseConfig):
     """Configures elastic inference pool with DNS-based service discovery.
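For intuition on what the new preserve_all_thinking flag controls downstream, here is a hypothetical, self-contained sketch of the semantics described in the RendererConfig field docs above; the helper name and message shapes are illustrative only, not the renderer-client API. By default only the latest assistant turn keeps its reasoning_content when earlier turns are re-rendered.

# Hypothetical sketch of preserve_all_thinking semantics; the helper and message
# shapes are illustrative, not the renderer-client API.
def strip_past_thinking(messages: list[dict], preserve_all_thinking: bool) -> list[dict]:
    last = max(i for i, m in enumerate(messages) if m["role"] == "assistant")
    rendered = []
    for i, message in enumerate(messages):
        message = dict(message)
        if message["role"] == "assistant" and i != last and not preserve_all_thinking:
            message.pop("reasoning_content", None)  # past-turn thinking is dropped
        rendered.append(message)
    return rendered

history = [
    {"role": "assistant", "reasoning_content": "plan A", "content": "call tool"},
    {"role": "tool", "content": "result"},
    {"role": "assistant", "reasoning_content": "plan B", "content": "final answer"},
]
assert "reasoning_content" not in strip_past_thinking(history, False)[0]
assert "reasoning_content" in strip_past_thinking(history, True)[0]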
diff --git a/pyproject.toml b/pyproject.toml
index d9b5468fa0..a54daaa387 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "torchaudio",
     "torchdata>=0.11.0",
     "transformers",
-    "vllm>=0.20.2",
+    "vllm",
     "wandb>=0.26.1",
     "ring-flash-attn>=0.1.8",
     "prime>=0.6.4",
@@ -36,6 +36,7 @@ dependencies = [
     "tilelang>=0.1.8",
     "flash-linear-attention",
     "nvidia-ml-py>=12.575.51",
+    "pybase64>=1.4.2",
 ]
 
 [project.scripts]
@@ -130,6 +131,7 @@ override-dependencies = [
 [tool.uv.exclude-newer-package]
 # we want latest vllm, remove next patch
 vllm = false
+tokenspeed-mla = false
 flash_attn_3 = false
 # Self-vendored packages on our primeintellect index
 reverse-text = false
@@ -166,15 +168,15 @@ prime-rl-configs = { workspace = true }
 torch = { index = "pytorch-cu128" }
 torchvision = { index = "pytorch-cu128" }
 torchaudio = { index = "pytorch-cu128" }
-verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", rev = "aa428f3" }
+verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers", rev = "3708ede" }
 torchtitan = { git = "https://github.com/pytorch/torchtitan", rev = "a1fdd7e" }
 dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
 transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
 flash-attn-4 = { git = "https://github.com/Dao-AILab/flash-attention.git", subdirectory = "flash_attn/cute", rev = "96bd151" }
 pydantic-config = { git = "https://github.com/samsja/pydantic_config.git", branch = "main" }
-vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" }
+vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }
 vllm = [
-    { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl", marker = "platform_machine == 'x86_64'" },
+    { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl", marker = "platform_machine == 'x86_64'" },
     { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", marker = "platform_machine == 'aarch64'" },
 ]
 reverse-text = { index = "primeintellect" }
diff --git a/src/prime_rl/inference/patches.py b/src/prime_rl/inference/patches.py
index 974aed5f82..780086be08 100644
--- a/src/prime_rl/inference/patches.py
+++ b/src/prime_rl/inference/patches.py
@@ -19,6 +19,51 @@ def transformers_v5_compat():
     monkey_patch_deep_gemm_silu_mul_quant_int64()
     monkey_patch_dp_engine_core_pause_resume_deadlock()
     monkey_patch_vllm_layerwise_reload_alias_buffers()
+    monkey_patch_return_routed_experts_with_nixl_connector()
+
+
+def monkey_patch_return_routed_experts_with_nixl_connector():
+    from vllm import envs
+    from vllm.config.vllm import VllmConfig
+    from vllm.logger import init_logger
+
+    logger = init_logger(__name__)
+    original_post_init = VllmConfig.__post_init__
+
+    if getattr(original_post_init, "_prime_rl_allows_nixl_routed_experts", False):
+        return
+
+    def _is_nixl_routed_experts_pd_config(config: VllmConfig) -> bool:
+        kv_transfer_config = config.kv_transfer_config
+        return (
+            config.model_config is not None
+            and config.model_config.enable_return_routed_experts
+            and kv_transfer_config is not None
+            and kv_transfer_config.kv_connector == "NixlConnector"
+            and kv_transfer_config.is_kv_transfer_instance
+        )
+
+    def _post_init(config: VllmConfig):
+        if not _is_nixl_routed_experts_pd_config(config):
+            return original_post_init(config)
+
+        if config.parallel_config.pipeline_parallel_size > 1:
+            raise ValueError("--enable-return-routed-experts is incompatible with pipeline parallelism (PP > 1).")
+        if envs.VLLM_USE_V2_MODEL_RUNNER:
+            raise ValueError("VLLM_USE_V2_MODEL_RUNNER does not yet support: routed experts capture")
+
+        # vLLM's validation rejects routed-experts capture with any KV connector,
+        # but our P/D path uses NIXL and stitches prefill/decode routed experts in
+        # the router. CPU KV offload remains rejected by prime-rl config validation.
+        config.model_config.enable_return_routed_experts = False
+        try:
+            return original_post_init(config)
+        finally:
+            config.model_config.enable_return_routed_experts = True
+
+    _post_init._prime_rl_allows_nixl_routed_experts = True
+    VllmConfig.__post_init__ = _post_init
+    logger.warning("Enabled vLLM routed-experts capture with NIXL connector patch.")
 
 
 def monkey_patch_vllm_layerwise_reload_alias_buffers():
@@ -897,9 +942,9 @@ def monkey_patch_dp_engine_core_pause_resume_deadlock():
     - on resume, wake every DP rank and force an immediate global unfinished
       sync instead of waiting for the normal 32-step cadence
 
-    This keeps the upstream pause-side fix from
-    https://github.com/vllm-project/vllm/pull/37024 and extends it with the
-    resume-side wave-state fix.
+    This also bypasses vLLM's two-phase DP pause implementation
+    (https://github.com/vllm-project/vllm/pull/39366), whose resume path
+    rejects states that our weight-update flow can validly hit.
     """
     from vllm.config import ParallelConfig
     from vllm.v1.core.sched.interface import PauseState
@@ -909,7 +954,8 @@ def monkey_patch_dp_engine_core_pause_resume_deadlock():
 
     _base_add_request = EngineCore.add_request
     _base_handle_client_request = EngineCoreProc._handle_client_request
-    _base_resume_scheduler = DPEngineCoreProc.resume_scheduler
+    _base_pause_complete = EngineCoreProc._pause_complete
+    _base_resume_scheduler = EngineCoreProc.resume_scheduler
 
     def _patched_add_request(self, request: Request, request_wave: int = 0):
         _base_add_request(self, request, request_wave)
@@ -930,8 +976,15 @@ def _patched_handle_client_request(self, request_type, request):
         else:
             _base_handle_client_request(self, request_type, request)
 
+    def _patched_pause_complete(self) -> bool:
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
+        return _base_pause_complete(self)
+
     def _patched_resume_scheduler(self):
         was_paused = self.scheduler.pause_state != PauseState.UNPAUSED
+        self.pending_pause = False
+        self.ignore_start_dp_wave = False
         _base_resume_scheduler(self)
         if was_paused:
             self.engines_running = True
@@ -948,6 +1001,7 @@ def _patched_has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
 
     DPEngineCoreProc.add_request = _patched_add_request
     DPEngineCoreProc._handle_client_request = _patched_handle_client_request
+    DPEngineCoreProc._pause_complete = _patched_pause_complete
     DPEngineCoreProc.resume_scheduler = _patched_resume_scheduler
     DPEngineCoreProc._has_global_unfinished_reqs = _patched_has_global_unfinished_reqs
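The NIXL patch above hinges on a small flip-restore pattern: hide the flag from the wrapped validator, then restore it for runtime. A minimal standalone sketch of the idea (toy Config class, not vLLM's API):

# Toy stand-in for VllmConfig/__post_init__; not vLLM API. The wrapped validator
# never sees the flag, but callers observe it enabled again afterwards.
class Config:
    def __init__(self):
        self.flag = True

    def validate(self):
        if self.flag:
            raise ValueError("flag rejected by upstream validation")

original_validate = Config.validate

def patched_validate(self):
    self.flag = False  # hide the flag from the upstream check
    try:
        return original_validate(self)
    finally:
        self.flag = True  # restore it for runtime

Config.validate = patched_validate

config = Config()
config.validate()  # no longer raises
assert config.flag  # still enabled after validation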
diff --git a/src/prime_rl/inference/vllm/routed_experts.py b/src/prime_rl/inference/vllm/routed_experts.py
new file mode 100644
index 0000000000..cad97e8574
--- /dev/null
+++ b/src/prime_rl/inference/vllm/routed_experts.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from typing import Any
+
+import numpy as np
+import pybase64
+from vllm.outputs import RequestOutput
+
+
+def serialize_routed_experts(routed_experts: Any) -> dict[str, Any] | None:
+    if routed_experts is None:
+        return None
+
+    array = np.asarray(routed_experts)
+    assert array.ndim == 3
+    assert np.issubdtype(array.dtype, np.integer)
+    if array.size:
+        assert array.min() >= 0
+        assert array.max() <= np.iinfo(np.uint8).max
+
+    compact = np.ascontiguousarray(array.astype(np.uint8, copy=False))
+    return {
+        "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
+        "shape": list(compact.shape),
+    }
+
+
+class RoutedExpertsCapture:
+    def __init__(self, generator: AsyncIterator[RequestOutput]):
+        self._generator = generator
+        self.routed_experts: dict[int, dict[str, Any]] = {}
+
+    async def __aiter__(self):
+        async for request_output in self._generator:
+            for output in request_output.outputs:
+                encoded = serialize_routed_experts(getattr(output, "routed_experts", None))
+                if encoded is not None:
+                    self.routed_experts[output.index] = encoded
+            yield request_output
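The payload produced by serialize_routed_experts above is undone by _decode_routed_experts in the orchestrator (later in this diff). A minimal round-trip sketch with toy values, assuming only numpy and pybase64:

import numpy as np
import pybase64

# Toy [seq_len, layers, topk] routing array; expert ids must fit in uint8.
routing = np.array([[[3, 7], [1, 0]], [[2, 5], [4, 6]]], dtype=np.int64)
compact = np.ascontiguousarray(routing.astype(np.uint8, copy=False))
payload = {
    "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
    "shape": list(compact.shape),
}
decoded = np.frombuffer(
    pybase64.b64decode_as_bytearray(payload["data"]), dtype=np.uint8
).reshape(payload["shape"])
np.testing.assert_array_equal(decoded, routing)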
diff --git a/src/prime_rl/inference/vllm/serving_chat_with_tokens.py b/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
index fae9465fbe..c78a76bde8 100644
--- a/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
+++ b/src/prime_rl/inference/vllm/serving_chat_with_tokens.py
@@ -14,22 +14,11 @@
 from vllm.reasoning import ReasoningParser
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 
-from prime_rl.inference.vllm.serving_tokens import _RoutedExpertsCaptureBase
+from prime_rl.inference.vllm.routed_experts import RoutedExpertsCapture
 
 logger = init_logger(__name__)
 
 
-class _RoutedExpertsCapture(_RoutedExpertsCaptureBase):
-    """Chat-endpoint variant: mutates choices in-place because
-    ``ChatCompletionResponseChoice`` is ``extra='allow'``, so an extra
-    ``routed_experts`` attribute survives serialization."""
-
-    def post_process(self, response: ChatCompletionResponse) -> None:
-        for choice in response.choices:
-            if choice.index in self.routed_experts:
-                choice.routed_experts = self.routed_experts[choice.index]
-
-
 class ChatCompletionRequestWithTokens(ChatCompletionRequest):
     field_names: ClassVar[Optional[set[str]]] = None
     tokens: list[int] = Field(description=("Prompt tokens to use for the request."))
@@ -55,11 +44,10 @@ async def chat_completion_full_generator(
         # 1. We create a custom generator that encapsulates the original result_generator in self._generator
         # 2. We override its __aiter__ method to also capture the routed experts as an extra field in ChatCompletionResponse.choices
        # 3. We override the full_generator method to use the custom generator instead of the original one if expert routing is enabled
+        capture = None
         if self.model_config.enable_return_routed_experts:
-            capture = _RoutedExpertsCapture(result_generator)
+            capture = RoutedExpertsCapture(result_generator)
             result_generator = capture
-        else:
-            capture = None
 
         response = await super().chat_completion_full_generator(
             request,
@@ -72,8 +60,10 @@ async def chat_completion_full_generator(
             reasoning_parser,
         )
 
-        if capture and isinstance(response, ChatCompletionResponse):
-            capture.post_process(response)
+        if capture is not None and isinstance(response, ChatCompletionResponse):
+            for choice in response.choices:
+                if choice.index in capture.routed_experts:
+                    choice.routed_experts = capture.routed_experts[choice.index]
 
         return response
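The capture class wired in above is easy to study in isolation: it is an async-iterable wrapper that records a side channel while yielding items through unchanged, so downstream consumers are unaffected. A standalone sketch with toy data in place of RequestOutput:

import asyncio

# Sketch of the capture pattern: wrap an async iterator, record a side channel,
# and yield every item unchanged.
class Capture:
    def __init__(self, gen):
        self._gen = gen
        self.seen: list[int] = []

    async def __aiter__(self):
        async for item in self._gen:
            self.seen.append(item)
            yield item

async def numbers():
    for i in range(3):
        yield i

async def main():
    cap = Capture(numbers())
    assert [x async for x in cap] == [0, 1, 2]
    assert cap.seen == [0, 1, 2]

asyncio.run(main())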
diff --git a/src/prime_rl/inference/vllm/serving_tokens.py b/src/prime_rl/inference/vllm/serving_tokens.py
index 359df83d11..789b361c19 100644
--- a/src/prime_rl/inference/vllm/serving_tokens.py
+++ b/src/prime_rl/inference/vllm/serving_tokens.py
@@ -10,9 +10,9 @@
    header and forwarded to ``engine_client.generate``. The DP-replicated
    inference servers prime-RL runs need this to target a specific replica.
 
-2. ``routed_experts`` per-token export — when the engine emits routing
-   decisions (``enable_return_routed_experts``), surface them on each choice.
-   This is what the trainer's router-replay path consumes.
+2. Compact ``routed_experts`` export — when the engine emits routing
+   decisions, surface them as base64 raw-byte payloads without requiring a vLLM
+   source fork.
 3. Server-side ``max_tokens`` defaulting — ``ServingTokens`` hands the
    client-supplied ``SamplingParams`` to the engine verbatim, and
@@ -30,13 +30,11 @@
 
 from __future__ import annotations
 
-import base64
 from collections.abc import AsyncGenerator
 from functools import cached_property
+from typing import Any
 
-import numpy as np
 from fastapi import Request
-from pydantic import Field
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse, RequestResponseMetadata
 from vllm.entrypoints.serve.disagg.protocol import (
     GenerateRequest,
@@ -48,64 +46,29 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
+from prime_rl.inference.vllm.routed_experts import RoutedExpertsCapture
+
 
 class PrimeRlGenerateResponseChoice(GenerateResponseChoice):
-    routed_experts: dict | None = Field(
-        default=None,
-        description=(
-            "Per-token expert routing decisions (base85-encoded int32 array + shape). "
-            "Populated only when the engine was launched with "
-            "``enable_return_routed_experts=True``; otherwise ``None``."
-        ),
-    )
+    routed_experts: dict[str, Any] | None = None
 
 
 class PrimeRlGenerateResponse(GenerateResponse):
     choices: list[PrimeRlGenerateResponseChoice]
 
 
-def encode_routed_experts(arr: np.ndarray) -> dict:
-    return {
-        "data": base64.b85encode(arr.tobytes()).decode("ascii"),
-        "shape": list(arr.shape),
-    }
-
-
-class _RoutedExpertsCaptureBase:
-    """Wraps the engine result generator and accumulates a
-    ``{output_index: encoded_experts}`` map as outputs stream. Subclasses
-    implement ``post_process`` to fold the captured map into the response
-    in whatever shape the endpoint returns (in-place vs rebuilt)."""
-
-    def __init__(self, generator: AsyncGenerator[RequestOutput, None]):
-        self._generator = generator
-        self.routed_experts: dict[int, dict] = {}
-
-    async def __aiter__(self):
-        async for request_output in self._generator:
-            for output in request_output.outputs:
-                if output.routed_experts is not None:
-                    self.routed_experts[output.index] = encode_routed_experts(output.routed_experts)
-            yield request_output
-
-
-class _RoutedExpertsCapture(_RoutedExpertsCaptureBase):
-    """Generate-endpoint variant: rebuilds the response with
-    ``PrimeRlGenerateResponseChoice`` because upstream's
-    ``GenerateResponseChoice`` isn't ``extra='allow'``, so an attribute
-    set after construction wouldn't survive serialization."""
-
+class _GenerateRoutedExpertsCapture(RoutedExpertsCapture):
     def post_process(self, response: GenerateResponse) -> PrimeRlGenerateResponse:
-        new_choices = [
+        choices = [
             PrimeRlGenerateResponseChoice(
-                **choice.model_dump(),
+                **choice.model_dump(exclude={"routed_experts"}),
                 routed_experts=self.routed_experts.get(choice.index),
             )
             for choice in response.choices
         ]
         return PrimeRlGenerateResponse(
             request_id=response.request_id,
-            choices=new_choices,
+            choices=choices,
             prompt_logprobs=response.prompt_logprobs,
             kv_transfer_params=response.kv_transfer_params,
         )
@@ -135,7 +98,7 @@ async def _client_set_max_tokens(raw_request: Request | None) -> bool:
 
 
 class PrimeRlServingTokens(ServingTokens):
-    """ServingTokens + DP-rank routing + routed_experts export + max_tokens defaulting."""
+    """ServingTokens + DP-rank routing + compact routed experts + max_tokens defaulting."""
 
     @cached_property
     def _max_tokens_defaults(self) -> tuple[dict, int | None]:
@@ -306,10 +269,10 @@ async def serve_tokens_full_generator(  # type: ignore[override]
         # encoded experts surface in the JSON. Skipping the wrapper when the
         # engine isn't producing routed experts keeps us a no-op subclass on
         # the common path.
-        capture: _RoutedExpertsCapture | None = None
+        capture: _GenerateRoutedExpertsCapture | None = None
         if self.model_config.enable_return_routed_experts:
-            capture = _RoutedExpertsCapture(result_generator)
-            result_generator = capture  # type: ignore[assignment]
+            capture = _GenerateRoutedExpertsCapture(result_generator)
+            result_generator = capture
 
         response = await super().serve_tokens_full_generator(
             request, result_generator, request_id, model_name, request_metadata
diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py
index bc1128ebc7..67ef7bfa1d 100644
--- a/src/prime_rl/orchestrator/orchestrator.py
+++ b/src/prime_rl/orchestrator/orchestrator.py
@@ -926,6 +926,8 @@ async def setup_rollout_inference_pool(
         renderer=config.renderer.name,
         tool_parser=config.renderer.tool_parser,
         reasoning_parser=config.renderer.reasoning_parser,
+        preserve_all_thinking=config.renderer.preserve_all_thinking,
+        preserve_thinking_between_tool_calls=config.renderer.preserve_thinking_between_tool_calls,
     )
     logger.info(f"Initialized {type(renderer).__name__} for {config.model.name}")
     inference_pool = await setup_inference_pool(
@@ -937,6 +939,8 @@ async def setup_rollout_inference_pool(
         tool_parser=config.renderer.tool_parser,
         reasoning_parser=config.renderer.reasoning_parser,
         renderer_pool_size=config.renderer.pool_size,
+        preserve_all_thinking=config.renderer.preserve_all_thinking,
+        preserve_thinking_between_tool_calls=config.renderer.preserve_thinking_between_tool_calls,
     )
     logger.info("Using direct renderer rollout client")
     return renderer, inference_pool
diff --git a/src/prime_rl/orchestrator/trajectories.py b/src/prime_rl/orchestrator/trajectories.py
index 3a45ee9ada..4cd6f5643c 100644
--- a/src/prime_rl/orchestrator/trajectories.py
+++ b/src/prime_rl/orchestrator/trajectories.py
@@ -6,12 +6,15 @@
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+import pybase64
 import torch
 import verifiers as vf
 from PIL import Image
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 from prime_rl.transport import TrainingSample
+from prime_rl.transport.types import RoutedExperts
 from prime_rl.utils.chat_template import (
     common_prefix_len,
     deserialize_tool_calls,
@@ -25,25 +28,53 @@
 # primitives are immutable. pixel_values/image_grid_thw are not mutated after creation.
 
 
+def _decode_routed_experts(payload: dict[str, Any] | None) -> np.ndarray | None:
+    if payload is None:
+        return None
+    shape = [int(dim) for dim in payload["shape"]]
+    decoded = pybase64.b64decode_as_bytearray(payload["data"])
+    expected_size = int(np.prod(shape, dtype=np.int64))
+    assert len(decoded) == expected_size, (len(decoded), expected_size, shape)
+    routed_experts = np.frombuffer(decoded, dtype=np.uint8).reshape(shape)
+    assert routed_experts.ndim == 3
+    return routed_experts
+
+
 def _align_routed_experts(
-    routed_experts: list[list[list[int]]] | None,
+    routed_experts: np.ndarray | None,
     expected_len: int,
-) -> list[list[list[int]]] | None:
+) -> np.ndarray | None:
     """Align routed_experts length with the expected token count.
 
     vLLM's capturer uses `num_tokens - 1` slot mappings because the final
     generated token was never fed as input to a forward pass and has no
     routing decision. Append zero-filled entries for the missing positions
     and truncate any excess entries beyond `expected_len`.
     """
-    if routed_experts is None or not routed_experts:
+    if routed_experts is None:
         return routed_experts
-    deficit = expected_len - len(routed_experts)
+    assert routed_experts.ndim == 3
+    if routed_experts.shape[0] > expected_len:
+        return np.ascontiguousarray(routed_experts[:expected_len])
+    deficit = expected_len - routed_experts.shape[0]
     if deficit <= 0:
         return routed_experts
-    num_layers = len(routed_experts[0])
-    topk = len(routed_experts[0][0])
-    zero_entry = [[0] * topk for _ in range(num_layers)]
-    return routed_experts + [zero_entry for _ in range(deficit)]
+    padding = np.zeros((deficit, routed_experts.shape[1], routed_experts.shape[2]), dtype=routed_experts.dtype)
+    return np.concatenate((routed_experts, padding), axis=0)
+
+
+def _pack_routed_experts(routed_experts: np.ndarray | None) -> RoutedExperts | None:
+    if routed_experts is None:
+        return None
+    routed_experts = np.ascontiguousarray(routed_experts)
+    return RoutedExperts(
+        data=routed_experts.tobytes(),
+        shape=list(routed_experts.shape),
+        dtype=str(routed_experts.dtype),
+    )
+
+
+def _unpack_routed_experts(routed_experts: RoutedExperts) -> np.ndarray:
+    return np.frombuffer(routed_experts.data, dtype=np.dtype(routed_experts.dtype)).reshape(routed_experts.shape).copy()
 
 
 def _common_prefix_len(a: list[int], b: list[int]) -> int:
@@ -296,13 +327,14 @@ def interleave_rollout(
     def prepare_step_tokens(step: vf.TrajectoryStep, step_idx: int) -> dict[str, Any] | None:
         tokens = step["tokens"]
         if tokens is not None:
+            routed_experts = _decode_routed_experts(tokens.get("routed_experts"))
             return {
                 "prompt_ids": list(tokens["prompt_ids"]),
                 "prompt_mask": [bool(i) for i in tokens["prompt_mask"]],
                 "completion_ids": list(tokens["completion_ids"]),
                 "completion_mask": [bool(i) for i in tokens["completion_mask"]],
                 "completion_logprobs": list(tokens["completion_logprobs"]),
-                "routed_experts": tokens.get("routed_experts"),
+                "routed_experts": routed_experts,
             }
 
         logger.warning(f"Missing rollout tokens for example {output['example_id']} step {step_idx}.")
@@ -328,7 +360,7 @@ def make_sample(tokens: dict[str, Any]) -> TrainingSample:
             len(tokens["prompt_ids"]) + len(tokens["completion_ids"]),
         )
         prompt_ids = list(tokens["prompt_ids"])
-        return TrainingSample(
+        sample = TrainingSample(
             prompt_ids=prompt_ids,
             prompt_mask=[bool(i) for i in tokens["prompt_mask"]],
             completion_ids=completion_ids,
@@ -337,9 +369,10 @@ def make_sample(tokens: dict[str, Any]) -> TrainingSample:
             completion_temperatures=[temperature] * len(completion_ids),
             teacher_logprobs=None,
             advantage=None,
-            routed_experts=routed_experts,
+            routed_experts=_pack_routed_experts(routed_experts),
             mm_token_type_ids=None,
         )
+        return sample
 
     def extend_sample(sample: TrainingSample, prefix_len: int, step_idx: int) -> None:
         """Extend an existing sample with a new trajectory step (extension property holds)."""
@@ -364,15 +397,17 @@ def extend_sample(sample: TrainingSample, prefix_len: int, step_idx: int) -> Non
 
         if tokens.get("routed_experts") is not None and sample.routed_experts is not None:
             step_routed = tokens["routed_experts"]
+            sample_routed_experts = _unpack_routed_experts(sample.routed_experts)
             # The previous step's last routing entry was zero-padded by _align_routed_experts
             # (vLLM only captures num_tokens-1 routings per request). This step actually
             # processed that boundary token as part of its prompt, so replace the zero-fill
             # with the real routing decision before appending new entries.
-            if prefix_len > 0 and prefix_len <= len(step_routed):
-                sample.routed_experts[prefix_len - 1] = step_routed[prefix_len - 1]
-            sample.routed_experts.extend(step_routed[prefix_len:])
+            if prefix_len > 0 and prefix_len <= step_routed.shape[0]:
+                sample_routed_experts[prefix_len - 1] = step_routed[prefix_len - 1]
+            sample_routed_experts = np.concatenate((sample_routed_experts, step_routed[prefix_len:]), axis=0)
             expected_len = len(sample.prompt_ids) + len(sample.completion_ids)
-            sample.routed_experts = _align_routed_experts(sample.routed_experts, expected_len)
+            sample_routed_experts = _align_routed_experts(sample_routed_experts, expected_len)
+            sample.routed_experts = _pack_routed_experts(sample_routed_experts)
 
     # Track [prefix_tokens, sample, last_step_idx] per active sample
     active_samples: list[tuple[list[int], TrainingSample, int]] = []
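A standalone sketch of the alignment rule implemented by _align_routed_experts above: vLLM reports num_tokens - 1 routing rows, so the tail is zero-padded, and overlong inputs are truncated (toy shapes, same semantics as the diff):

import numpy as np

def align(rows: np.ndarray, expected_len: int) -> np.ndarray:
    # Truncate excess rows, or pad missing rows with zeros.
    if rows.shape[0] >= expected_len:
        return rows[:expected_len]
    pad = np.zeros((expected_len - rows.shape[0], *rows.shape[1:]), dtype=rows.dtype)
    return np.concatenate((rows, pad), axis=0)

rows = np.arange(12, dtype=np.uint8).reshape(3, 2, 2)  # [tokens - 1, layers, topk]
assert align(rows, 4).shape == (4, 2, 2)
assert np.all(align(rows, 4)[3] == 0)  # final token has no routing decision
assert align(rows, 2).shape == (2, 2, 2)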
diff --git a/src/prime_rl/trainer/batch.py b/src/prime_rl/trainer/batch.py
index 662df36a80..ca248a43d4 100644
--- a/src/prime_rl/trainer/batch.py
+++ b/src/prime_rl/trainer/batch.py
@@ -1,6 +1,52 @@
 import copy
 
-from prime_rl.transport.types import MicroBatch, TrainingSample
+from prime_rl.transport.types import MicroBatch, RoutedExperts, TrainingSample
+
+ROUTED_EXPERTS_DTYPE_ITEMSIZE = {
+    "uint8": 1,
+    "int16": 2,
+    "int32": 4,
+}
+
+
+def _copy_routed_experts(routed_experts: RoutedExperts) -> RoutedExperts:
+    return RoutedExperts(
+        data=routed_experts.data,
+        shape=list(routed_experts.shape),
+        dtype=routed_experts.dtype,
+    )
+
+
+def _routed_experts_row_size(routed_experts: RoutedExperts) -> int:
+    return routed_experts.shape[1] * routed_experts.shape[2] * ROUTED_EXPERTS_DTYPE_ITEMSIZE[routed_experts.dtype]
+
+
+def _slice_routed_experts(routed_experts: RoutedExperts, seq_len: int) -> RoutedExperts:
+    row_size = _routed_experts_row_size(routed_experts)
+    return RoutedExperts(
+        data=routed_experts.data[: seq_len * row_size],
+        shape=[seq_len, routed_experts.shape[1], routed_experts.shape[2]],
+        dtype=routed_experts.dtype,
+    )
+
+
+def _append_routed_experts(dst: MicroBatch, src: MicroBatch) -> None:
+    dst_routed = dst.routed_experts
+    src_routed = src.routed_experts
+    assert dst_routed is not None
+    assert src_routed is not None
+    assert dst_routed.dtype == src_routed.dtype
+    assert dst_routed.shape[1:] == src_routed.shape[1:]
+    dst_routed.data += src_routed.data
+    dst_routed.shape[0] += src_routed.shape[0]
+
+
+def _pad_routed_experts(micro_batch: MicroBatch, padding_size: int) -> None:
+    routed_experts = micro_batch.routed_experts
+    assert routed_experts is not None
+    row_size = _routed_experts_row_size(routed_experts)
+    routed_experts.data += b"\0" * (padding_size * row_size)
+    routed_experts.shape[0] += padding_size
 
 
 def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch:
@@ -23,7 +69,9 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
     # Teacher logprobs already cover the full sequence (prompt + completion),
     # computed via prefill in the orchestrator when a teacher model is configured
     teacher_logprobs = training_example.teacher_logprobs
-    routed_experts = training_example.routed_experts
+    routed_experts = (
+        _copy_routed_experts(training_example.routed_experts) if training_example.routed_experts is not None else None
+    )
 
     if len(input_ids) > seq_len:
         input_ids = input_ids[:seq_len]
@@ -35,7 +83,7 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
         if teacher_logprobs is not None:
             teacher_logprobs = teacher_logprobs[:seq_len]
         if routed_experts is not None:
-            routed_experts = routed_experts[:seq_len]
+            routed_experts = _slice_routed_experts(routed_experts, seq_len)
         if mm_token_type_ids is not None:
             mm_token_type_ids = mm_token_type_ids[:seq_len]
 
@@ -53,9 +101,10 @@ def prepare_sample(training_example: TrainingSample, seq_len: int) -> MicroBatch
         assert len(teacher_logprobs) == len(input_ids), f"teacher_logprobs: {len(teacher_logprobs)}"
 
     if routed_experts is not None:
-        assert len(routed_experts) == len(input_ids), (
-            f"routed_experts: {len(routed_experts)}, input_ids: {len(input_ids)}"
+        assert routed_experts.shape[0] == len(input_ids), (
+            f"routed_experts: {routed_experts.shape}, input_ids: {len(input_ids)}"
         )
+        assert len(routed_experts.data) == len(input_ids) * _routed_experts_row_size(routed_experts)
 
     if mm_token_type_ids is not None:
         assert len(mm_token_type_ids) == len(input_ids), (
@@ -129,10 +178,9 @@ def packed_samples_into_micro_bs(
             if bin_content.teacher_logprobs is None:
                 bin_content.teacher_logprobs = []
             bin_content.teacher_logprobs.extend(sample.teacher_logprobs)
+        assert (bin_content.routed_experts is None) == (sample.routed_experts is None)
         if sample.routed_experts is not None:
-            if bin_content.routed_experts is None:
-                bin_content.routed_experts = []
-            bin_content.routed_experts.extend(sample.routed_experts)
+            _append_routed_experts(bin_content, sample)
         if sample.mm_token_type_ids is not None:
             if bin_content.mm_token_type_ids is None:
                 bin_content.mm_token_type_ids = []
@@ -178,6 +226,8 @@ def pad_micro_batch(micro_batch: MicroBatch, pad_to_multiple_of: int) -> MicroBa
     )
     if micro_batch.mm_token_type_ids is not None:
         micro_batch.mm_token_type_ids.extend([0] * padding_size)
+    if micro_batch.routed_experts is not None:
+        _pad_routed_experts(micro_batch, padding_size)
 
     return micro_batch
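The byte arithmetic used by the batch helpers above can be checked directly with numpy: one token row is layers * topk * itemsize bytes, so slicing and padding operate on the flat buffer without ever rebuilding nested lists (toy shapes, mirroring _slice_routed_experts and _pad_routed_experts):

import numpy as np

arr = np.arange(24, dtype=np.uint8).reshape(4, 3, 2)  # [seq_len=4, layers=3, topk=2]
data = arr.tobytes()
row_size = 3 * 2 * arr.itemsize  # layers * topk * itemsize
sliced = np.frombuffer(data[: 2 * row_size], dtype=np.uint8).reshape(2, 3, 2)
np.testing.assert_array_equal(sliced, arr[:2])  # byte slice == row slice
padded = data + b"\0" * row_size  # one zero-filled row of padding
assert len(padded) == 5 * row_size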
diff --git a/src/prime_rl/trainer/rl/data.py b/src/prime_rl/trainer/rl/data.py
index ffc4bc627f..cabd126f59 100644
--- a/src/prime_rl/trainer/rl/data.py
+++ b/src/prime_rl/trainer/rl/data.py
@@ -12,6 +12,12 @@
 from prime_rl.trainer.world import get_world
 from prime_rl.transport import MicroBatch, MicroBatchReceiver, TransportConfig, setup_micro_batch_receiver
 
+ROUTED_EXPERTS_TORCH_DTYPES = {
+    "uint8": torch.uint8,
+    "int16": torch.int16,
+    "int32": torch.int32,
+}
+
 
 class TensorMicroBatch(TypedDict):
     """A micro batch of data for training."""
@@ -195,6 +201,18 @@ def _micro_batch_to_tensor(self, micro_batch: MicroBatch) -> TensorMicroBatch:
         if micro_batch.lora_num_tokens is None:
             micro_batch.lora_num_tokens = [0] * self.multi_run_manager.max_runs
             micro_batch.lora_num_tokens[0] = len(micro_batch.input_ids)
+        routed_experts = None
+        packed_routed_experts = micro_batch.routed_experts
+        if packed_routed_experts is not None:
+            routed_experts = (
+                torch.frombuffer(
+                    packed_routed_experts.data,
+                    dtype=ROUTED_EXPERTS_TORCH_DTYPES[packed_routed_experts.dtype],
+                )
+                .reshape(packed_routed_experts.shape)
+                .to(torch.int32)
+                .unsqueeze(0)
+            )
         return TensorMicroBatch(
             input_ids=torch.tensor(micro_batch.input_ids, dtype=torch.long).unsqueeze(0),
             position_ids=torch.tensor(micro_batch.position_ids, dtype=torch.long).unsqueeze(0),
@@ -218,10 +236,6 @@ def _micro_batch_to_tensor(self, micro_batch: MicroBatch) -> TensorMicroBatch:
             mm_token_type_ids=torch.tensor(micro_batch.mm_token_type_ids, dtype=torch.long).unsqueeze(0)
             if micro_batch.mm_token_type_ids is not None
             else None,
-            routed_experts=torch.tensor(micro_batch.routed_experts, dtype=torch.int32).unsqueeze(
-                0
-            )  # [1, seq_len, layers, topk]
-            if micro_batch.routed_experts is not None
-            else None,
+            routed_experts=routed_experts,
             sft_loss=micro_batch.sft_loss,
         )
diff --git a/src/prime_rl/transport/types.py b/src/prime_rl/transport/types.py
index 4bc594f06d..cc943e9b76 100644
--- a/src/prime_rl/transport/types.py
+++ b/src/prime_rl/transport/types.py
@@ -1,6 +1,14 @@
 import msgspec
 
 
+# Routed experts are large per-token arrays. tolist() is too expensive, so we
+# send raw bytes through msgpack and carry the shape/dtype needed to rebuild.
+class RoutedExperts(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
+    data: bytes
+    shape: list[int]  # [seq_len, layers, topk]
+    dtype: str
+
+
 # Orchestrator -> Packer
 class TrainingSample(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
     """A single training example."""
@@ -21,7 +29,7 @@ class TrainingSample(msgspec.Struct, array_like=True, gc=False, omit_defaults=Tr
     # image_grid_thw: grid dimensions [num_images, 3] where each entry is [temporal, height, width]
     image_grid_thw: list[list[int]] | None = None
 
-    routed_experts: list[list[list[int]]] | None = None  # [seq_len, layers, topk]
+    routed_experts: RoutedExperts | None = None
 
     # mm_token_type_ids: token type ids per token [batch seq], int64 (0=text, 1=image, 2=video)
     mm_token_type_ids: list[int] | None = None
@@ -49,7 +57,7 @@ class MicroBatch(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
     temperatures: list[float]  # Per-token temperatures used during generation
     teacher_logprobs: list[float] | None = None
     lora_num_tokens: list[int] | None = None
-    routed_experts: list[list[list[int]]] | None = None
+    routed_experts: RoutedExperts | None = None
 
     # Multimodal fields (Qwen3-VL) — pixel_values stored as raw float32 bytes for efficient serialization
     pixel_values: bytes | None = None
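A round-trip sketch for the RoutedExperts wire format defined above, showing that raw bytes plus shape/dtype survive msgpack without per-element encoding (struct definition copied from this diff; toy array):

import msgspec
import numpy as np

class RoutedExperts(msgspec.Struct, array_like=True, gc=False, omit_defaults=True):
    data: bytes
    shape: list[int]
    dtype: str

arr = np.arange(8, dtype=np.uint8).reshape(2, 2, 2)
packed = RoutedExperts(data=arr.tobytes(), shape=list(arr.shape), dtype=str(arr.dtype))
wire = msgspec.msgpack.encode(packed)
unpacked = msgspec.msgpack.decode(wire, type=RoutedExperts)
restored = np.frombuffer(unpacked.data, dtype=np.dtype(unpacked.dtype)).reshape(unpacked.shape)
np.testing.assert_array_equal(restored, arr)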
diff --git a/src/prime_rl/utils/client.py b/src/prime_rl/utils/client.py
index 21659dfc46..fedbdddb8e 100644
--- a/src/prime_rl/utils/client.py
+++ b/src/prime_rl/utils/client.py
@@ -68,6 +68,8 @@ def __init__(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ):
         renderer_model_name = model_name if train_client_type == "renderer" else None
         self._train_clients = setup_clients(
@@ -78,6 +80,8 @@ def __init__(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
         )
         self._eval_clients = setup_clients(client_config, client_type=eval_client_type)
         self._admin_clients = setup_admin_clients(client_config)
@@ -129,6 +133,8 @@ async def setup_inference_pool(
     tool_parser: str | None = None,
     reasoning_parser: str | None = None,
     renderer_pool_size: int | None = None,
+    preserve_all_thinking: bool = False,
+    preserve_thinking_between_tool_calls: bool = False,
 ) -> InferencePool:
     """Create an inference pool from config (static or elastic)."""
     logger = get_logger()
@@ -152,6 +158,8 @@ async def setup_inference_pool(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
         )
 
         logger.info(
@@ -168,6 +176,8 @@ async def setup_inference_pool(
         tool_parser=tool_parser,
         reasoning_parser=reasoning_parser,
         renderer_pool_size=renderer_pool_size,
+        preserve_all_thinking=preserve_all_thinking,
+        preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
     )
 
@@ -179,6 +189,8 @@ def setup_clients(
     tool_parser: str | None = None,
     reasoning_parser: str | None = None,
     renderer_pool_size: int | None = None,
+    preserve_all_thinking: bool = False,
+    preserve_thinking_between_tool_calls: bool = False,
 ) -> list[vf.ClientConfig]:
     clients = []
     client_idx = 0
@@ -196,6 +208,8 @@ def setup_clients(
                 renderer_pool_size=renderer_pool_size,
                 tool_parser=tool_parser,
                 reasoning_parser=reasoning_parser,
+                preserve_all_thinking=preserve_all_thinking,
+                preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
                 api_base_url=base_url,
                 api_key_var=client_config.api_key_var,
                 timeout=client_config.timeout,
diff --git a/src/prime_rl/utils/elastic.py b/src/prime_rl/utils/elastic.py
index 902f873903..c59f81e27f 100644
--- a/src/prime_rl/utils/elastic.py
+++ b/src/prime_rl/utils/elastic.py
@@ -110,6 +110,8 @@ def __init__(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ):
         self.logger = get_logger()
         self.client_config = client_config
@@ -125,6 +127,8 @@ def __init__(
         self.tool_parser = tool_parser
         self.reasoning_parser = reasoning_parser
         self.renderer_pool_size = renderer_pool_size
+        self.preserve_all_thinking = preserve_all_thinking
+        self.preserve_thinking_between_tool_calls = preserve_thinking_between_tool_calls
         self.router_url = client_config.router_url
 
         self._servers: dict[str, ServerState] = {}
@@ -152,6 +156,8 @@ async def from_config(
         tool_parser: str | None = None,
         reasoning_parser: str | None = None,
         renderer_pool_size: int | None = None,
+        preserve_all_thinking: bool = False,
+        preserve_thinking_between_tool_calls: bool = False,
     ) -> ElasticInferencePool:
         if client_config.elastic is None:
             raise ValueError("Elastic inference pool requires elastic config")
@@ -164,6 +170,8 @@ async def from_config(
             tool_parser=tool_parser,
             reasoning_parser=reasoning_parser,
             renderer_pool_size=renderer_pool_size,
+            preserve_all_thinking=preserve_all_thinking,
+            preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
        )
         await pool.start()
         return pool
@@ -214,6 +222,8 @@ def _rebuild_clients(self) -> None:
                 tool_parser=self.tool_parser,
                 reasoning_parser=self.reasoning_parser,
                 renderer_pool_size=self.renderer_pool_size,
+                preserve_all_thinking=self.preserve_all_thinking,
+                preserve_thinking_between_tool_calls=self.preserve_thinking_between_tool_calls,
             )
             if urls
             else []
diff --git a/tests/unit/inference/test_serving_tokens.py b/tests/unit/inference/test_serving_tokens.py
index ac5b52b3d4..1882e57e55 100644
--- a/tests/unit/inference/test_serving_tokens.py
+++ b/tests/unit/inference/test_serving_tokens.py
@@ -3,8 +3,7 @@
 The full happy-path is owned upstream by vLLM 0.20's
 ``vllm/entrypoints/serve/disagg`` test suite. We only cover the prime-RL deltas here:
 
-* ``encode_routed_experts`` round-trips a numpy array as expected.
-* ``PrimeRlGenerateResponseChoice`` accepts the optional field.
+* ``serialize_routed_experts`` round-trips a compact raw-byte payload.
 * The subclass attaches its overrides without monkey-patching the parent.
 * ``_client_set_max_tokens`` distinguishes raw-body shapes correctly.
 """
@@ -12,19 +11,26 @@
 from __future__ import annotations
 
 import asyncio
-import base64
 
 import numpy as np
+import pybase64
+from vllm.entrypoints.serve.disagg.protocol import GenerateResponse, GenerateResponseChoice
 
+from prime_rl.inference.vllm.routed_experts import serialize_routed_experts
 from prime_rl.inference.vllm.serving_tokens import (
-    PrimeRlGenerateResponse,
-    PrimeRlGenerateResponseChoice,
     PrimeRlServingTokens,
     _client_set_max_tokens,
-    encode_routed_experts,
+    _GenerateRoutedExpertsCapture,
 )
 
 
+def _decode_routed_experts(encoded: dict) -> np.ndarray:
+    return np.frombuffer(
+        pybase64.b64decode_as_bytearray(encoded["data"]),
+        dtype=np.uint8,
+    ).reshape(encoded["shape"])
+
+
 class _FakeRawRequest:
     def __init__(self, body):
         self._body = body
@@ -36,50 +42,54 @@ class _FakeRawRequest:
         return self._body
 
 
-def test_encode_routed_experts_roundtrip():
-    arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
-    encoded = encode_routed_experts(arr)
-
-    assert encoded["shape"] == [2, 3]
-    decoded = np.frombuffer(base64.b85decode(encoded["data"]), dtype=np.int32).reshape(encoded["shape"])
-    np.testing.assert_array_equal(decoded, arr)
-
+async def _empty_request_outputs():
+    if False:
+        yield
 
-def test_routed_experts_choice_accepts_none_and_dict():
-    no_re = PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[1, 2])
-    assert no_re.routed_experts is None
 
-    encoded = encode_routed_experts(np.zeros((1, 1), dtype=np.int32))
-    with_re = PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[1], routed_experts=encoded)
-    assert with_re.routed_experts == encoded
-
-
-def test_response_only_serializes_declared_fields():
-    # Upstream silently drops id=/created=/model=/usage= because they're not
-    # declared on GenerateResponse. Our subclass adds nothing to that surface
-    # — it only widens the choices type — so the JSON shape stays slim.
-    resp = PrimeRlGenerateResponse(
-        request_id="gen-x",
-        choices=[PrimeRlGenerateResponseChoice(index=0, finish_reason="stop", token_ids=[7])],
-    )
-    dumped = resp.model_dump()
-    assert set(dumped.keys()) == {
-        "request_id",
-        "choices",
-        "prompt_logprobs",
-        "kv_transfer_params",
-    }
-    assert dumped["choices"][0]["routed_experts"] is None
-
-
-def test_subclass_inherits_serve_tokens_full_generator():
-    # The subclass adds an override; make sure we didn't accidentally rebind
-    # ``serve_tokens`` to a parent attribute via __dict__-update tricks later.
+def test_subclass_only_overrides_serve_tokens():
+    assert PrimeRlServingTokens.serve_tokens is not PrimeRlServingTokens.__mro__[1].serve_tokens
     assert (
         PrimeRlServingTokens.serve_tokens_full_generator
         is not PrimeRlServingTokens.__mro__[1].serve_tokens_full_generator
     )
-    assert PrimeRlServingTokens.serve_tokens is not PrimeRlServingTokens.__mro__[1].serve_tokens
+
+
+def test_serialize_routed_experts_uses_compact_raw_payload():
+    routed_experts = np.array(
+        [
+            [[1, 2], [3, 4]],
+            [[5, 6], [7, 8]],
+        ],
+        dtype=np.int64,
+    )
+
+    encoded = serialize_routed_experts(routed_experts)
+    assert encoded is not None
+
+    decoded = _decode_routed_experts(encoded)
+    assert decoded.dtype == np.uint8
+    np.testing.assert_array_equal(decoded, routed_experts)
+
+
+def test_generate_response_post_process_replaces_upstream_routed_experts():
+    compact_routed_experts = {"data": "AQID", "shape": [1, 1, 3]}
+    capture = _GenerateRoutedExpertsCapture(_empty_request_outputs())
+    capture.routed_experts[0] = compact_routed_experts
+    response = GenerateResponse(
+        request_id="request-id",
+        choices=[
+            GenerateResponseChoice(
+                index=0,
+                token_ids=[1, 2, 3],
+                routed_experts="upstream-npy-payload",
+            )
+        ],
+    )
+
+    processed = capture.post_process(response)
+
+    assert processed.choices[0].routed_experts == compact_routed_experts
 
 
 def test_client_set_max_tokens_recognizes_explicit_value():
diff --git a/tests/unit/orchestrator/test_batch.py b/tests/unit/orchestrator/test_batch.py
index a2e2e50079..fc95de4e2f 100644
--- a/tests/unit/orchestrator/test_batch.py
+++ b/tests/unit/orchestrator/test_batch.py
@@ -1,7 +1,17 @@
+import numpy as np
 import pytest
 
 from prime_rl.trainer.batch import prepare_batch, prepare_sample
-from prime_rl.transport.types import TrainingSample
+from prime_rl.transport.types import RoutedExperts, TrainingSample
+
+
+def _routed_experts(data, dtype=np.uint8):
+    routed_experts = np.asarray(data, dtype=dtype)
+    return RoutedExperts(
+        data=routed_experts.tobytes(),
+        shape=list(routed_experts.shape),
+        dtype=str(routed_experts.dtype),
+    )
 
 
 @pytest.fixture
@@ -109,6 +119,7 @@ def test_prepare_sample_with_routed_experts():
     """Routed experts are passed through prepare_sample and match input_ids length."""
     # 2 prompt + 2 completion = 4 tokens, 2 layers, topk=2
     routed_experts = [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]], [[1, 0], [3, 2]]]
+    routed = _routed_experts(routed_experts)
     sample = TrainingSample(
         prompt_ids=[1, 2],
         prompt_mask=[False, False],
@@ -117,18 +128,21 @@ def test_prepare_sample_with_routed_experts():
         completion_logprobs=[-0.1, -0.2],
         completion_temperatures=[1.0, 1.0],
         advantage=1.0,
-        routed_experts=routed_experts,
+        routed_experts=routed,
     )
 
     micro_batch = prepare_sample(sample, seq_len=8)
 
     assert micro_batch.routed_experts is not None
-    assert len(micro_batch.routed_experts) == 4
-    assert micro_batch.routed_experts == routed_experts
+    assert micro_batch.routed_experts.data == routed.data
+    assert micro_batch.routed_experts.shape == routed.shape
+    assert micro_batch.routed_experts.dtype == routed.dtype
 
 
 def test_prepare_sample_truncates_routed_experts():
     """Routed experts are truncated to seq_len when input exceeds it."""
     routed_experts = [[[0, 1]], [[2, 3]], [[4, 5]], [[6, 7]]]
+    routed = _routed_experts(routed_experts)
+    expected = _routed_experts(routed_experts[:3])
     sample = TrainingSample(
         prompt_ids=[1, 2],
         prompt_mask=[False, False],
@@ -137,13 +151,14 @@ def test_prepare_sample_truncates_routed_experts():
         completion_logprobs=[-0.1, -0.2],
         completion_temperatures=[1.0, 1.0],
         advantage=1.0,
-        routed_experts=routed_experts,
+        routed_experts=routed,
     )
 
     micro_batch = prepare_sample(sample, seq_len=3)
 
     assert micro_batch.routed_experts is not None
-    assert len(micro_batch.routed_experts) == 3
-    assert micro_batch.routed_experts == routed_experts[:3]
+    assert micro_batch.routed_experts.data == expected.data
+    assert micro_batch.routed_experts.shape == expected.shape
+    assert micro_batch.routed_experts.dtype == expected.dtype
 
 
 def test_prepare_sample_none_routed_experts():
diff --git a/tests/unit/orchestrator/test_orchestrator_setup.py b/tests/unit/orchestrator/test_orchestrator_setup.py
index ff9bb5b79f..5c5b420fc5 100644
--- a/tests/unit/orchestrator/test_orchestrator_setup.py
+++ b/tests/unit/orchestrator/test_orchestrator_setup.py
@@ -50,6 +50,8 @@ async def run() -> None:
                 tool_parser=None,
                 reasoning_parser=None,
                 pool_size=None,
+                preserve_all_thinking=True,
+                preserve_thinking_between_tool_calls=False,
             ),
         )
         rollout_client_config = SimpleNamespace(base_url=["http://localhost:8000/v1"])
@@ -79,6 +81,8 @@ async def run() -> None:
             renderer="qwen3_vl",
             tool_parser=None,
             reasoning_parser=None,
+            preserve_all_thinking=True,
+            preserve_thinking_between_tool_calls=False,
         )
         setup_pool_mock.assert_awaited_once_with(
             rollout_client_config,
@@ -89,6 +93,8 @@ async def run() -> None:
             tool_parser=None,
             reasoning_parser=None,
             renderer_pool_size=None,
+            preserve_all_thinking=True,
+            preserve_thinking_between_tool_calls=False,
         )
 
     asyncio.run(run())
diff --git a/tests/unit/orchestrator/test_trajectories.py b/tests/unit/orchestrator/test_trajectories.py
index 6fa169760c..303a02fd11 100644
--- a/tests/unit/orchestrator/test_trajectories.py
+++ b/tests/unit/orchestrator/test_trajectories.py
@@ -3,6 +3,7 @@
 from unittest.mock import MagicMock
 
 import numpy as np
+import pybase64
 import pytest
 import verifiers as vf
 from PIL import Image
@@ -30,6 +31,21 @@
     return np.frombuffer(pixel_bytes, dtype=np.float32).reshape(shape).tolist()
 
 
+def _routed_experts_payload(data) -> dict:
+    arr = np.asarray(data, dtype=np.uint8)
+    return {
+        "data": pybase64.b64encode(memoryview(np.ascontiguousarray(arr))).decode("ascii"),
+        "shape": list(arr.shape),
+    }
+
+
+def _sample_routed_experts(sample) -> np.ndarray:
+    assert sample.routed_experts is not None
+    return np.frombuffer(sample.routed_experts.data, dtype=np.dtype(sample.routed_experts.dtype)).reshape(
+        sample.routed_experts.shape
+    )
+
+
 def test_deserialize_tool_calls_does_not_inject_missing_key():
     messages = [{"role": "assistant", "content": "hello"}]
 
@@ -1857,40 +1873,43 @@ def test_align_routed_experts_none():
 
 
 def test_align_routed_experts_empty():
-    result = _align_routed_experts([], 10)
-    assert result == []
+    experts = np.empty((0, 2, 2), dtype=np.uint8)
+    result = _align_routed_experts(experts, 10)
+    assert result is not None
+    assert result.shape == (10, 2, 2)
+    assert np.all(result == 0)
 
 
 def test_align_routed_experts_no_deficit():
     # 3 tokens, 2 layers, topk=2
-    experts = [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]]]
+    experts = np.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 2], [1, 3]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=3)
-    assert result == experts
+    np.testing.assert_array_equal(result, experts)
 
 
 def test_align_routed_experts_with_deficit():
     # 2 tokens but expected 4 (deficit of 2)
-    experts = [[[1, 2], [3, 4]], [[5, 6], [7, 0]]]
+    experts = np.asarray([[[1, 2], [3, 4]], [[5, 6], [7, 0]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=4)
-    assert len(result) == 4
-    assert result[:2] == experts
+    assert result is not None
+    assert result.shape == (4, 2, 2)
+    np.testing.assert_array_equal(result[:2], experts)
     # Padded entries should be zero-filled with same shape [layers=2, topk=2]
-    assert result[2] == [[0, 0], [0, 0]]
-    assert result[3] == [[0, 0], [0, 0]]
+    np.testing.assert_array_equal(result[2], [[0, 0], [0, 0]])
+    np.testing.assert_array_equal(result[3], [[0, 0], [0, 0]])
 
 
 def test_align_routed_experts_excess_length():
-    experts = [[[1, 2]], [[3, 4]], [[5, 6]]]
+    experts = np.asarray([[[1, 2]], [[3, 4]], [[5, 6]]], dtype=np.uint8)
     result = _align_routed_experts(experts, expected_len=2)
-    # No truncation, just returns as-is
-    assert result == experts
+    np.testing.assert_array_equal(result, experts[:2])
 
 
 def test_interleave_rollout_single_step_with_routed_experts():
     """Routed experts are aligned and passed through for a single-step trajectory."""
     # prompt_ids=[1,2], completion_ids=[3,4] -> total 4 tokens
     # vLLM returns num_tokens-1 = 3 routed expert entries
-    routed_experts_from_vllm = [[[0, 1]], [[2, 3]], [[4, 5]]]  # 3 entries, 1 layer, topk=2
+    routed_experts_from_vllm = np.asarray([[[0, 1]], [[2, 3]], [[4, 5]]], dtype=np.uint8)
     output = vf.RolloutOutput(
         example_id=0,
         trajectory=[
@@ -1906,7 +1925,7 @@ def test_interleave_rollout_single_step_with_routed_experts():
                     completion_logprobs=[-0.1, -0.2],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=routed_experts_from_vllm,
+                    routed_experts=_routed_experts_payload(routed_experts_from_vllm),
                 ),
                 reward=None,
                 advantage=None,
@@ -1926,18 +1945,19 @@ def test_interleave_rollout_single_step_with_routed_experts():
 
     # Should be aligned to 4 tokens (2 prompt + 2 completion)
     assert sample.routed_experts is not None
-    assert len(sample.routed_experts) == 4
+    routed_experts = _sample_routed_experts(sample)
+    assert routed_experts.shape == (4, 1, 2)
     # First 3 are original, last one is zero-padded
-    assert sample.routed_experts[:3] == routed_experts_from_vllm
-    assert sample.routed_experts[3] == [[0, 0]]
+    np.testing.assert_array_equal(routed_experts[:3], routed_experts_from_vllm)
+    np.testing.assert_array_equal(routed_experts[3], [[0, 0]])
 
 
 def test_interleave_rollout_multi_step_with_routed_experts():
     """Routed experts are extended and aligned across multi-step trajectories."""
     # Step 1: prompt=[1,2], completion=[3,4] -> 4 tokens, vLLM returns 3
-    step1_experts = [[[1, 2]], [[3, 4]], [[5, 6]]]
+    step1_experts = np.asarray([[[1, 2]], [[3, 4]], [[5, 6]]], dtype=np.uint8)
     # Step 2: prompt=[1,2,3,4,5,6], completion=[7,8] -> 8 tokens, vLLM returns 7
-    step2_experts = [[[1, 0]], [[2, 0]], [[3, 0]], [[4, 0]], [[5, 0]], [[6, 0]], [[7, 0]]]
+    step2_experts = np.asarray([[[1, 0]], [[2, 0]], [[3, 0]], [[4, 0]], [[5, 0]], [[6, 0]], [[7, 0]]], dtype=np.uint8)
 
     output = vf.RolloutOutput(
         example_id=0,
@@ -1954,7 +1974,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
                     completion_logprobs=[-0.1, -0.2],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=step1_experts,
+                    routed_experts=_routed_experts_payload(step1_experts),
                 ),
                 reward=None,
                 advantage=None,
@@ -1978,7 +1998,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
                     completion_logprobs=[-0.3, -0.4],
                     overlong_prompt=False,
                     is_truncated=False,
-                    routed_experts=step2_experts,
+                    routed_experts=_routed_experts_payload(step2_experts),
                 ),
                 reward=None,
                 advantage=None,
@@ -1999,7 +2019,7 @@ def test_interleave_rollout_multi_step_with_routed_experts():
     # Merged sample: prompt=[1,2], completion=[3,4,5,6,7,8] -> 8 tokens total
     assert len(sample.prompt_ids) + len(sample.completion_ids) == 8
     assert sample.routed_experts is not None
-    assert len(sample.routed_experts) == 8
+    assert _sample_routed_experts(sample).shape == (8, 1, 2)
 
 
 def test_interleave_rollout_none_routed_experts_stays_none():
diff --git a/tests/unit/utils/test_client.py b/tests/unit/utils/test_client.py
index 6b48790ef3..3b13e30bd3 100644
--- a/tests/unit/utils/test_client.py
+++ b/tests/unit/utils/test_client.py
@@ -62,10 +62,13 @@ def test_setup_clients_assigns_renderer_and_dp_rank_headers():
         client_config,
         client_type="renderer",
         renderer_name="qwen3_vl",
+        preserve_all_thinking=True,
     )
 
     assert [client.client_type for client in clients] == ["renderer", "renderer"]
     assert [client.renderer for client in clients] == ["qwen3_vl", "qwen3_vl"]
+    assert [client.preserve_all_thinking for client in clients] == [True, True]
+    assert [client.preserve_thinking_between_tool_calls for client in clients] == [False, False]
    assert [client.renderer_model_name for client in clients] == [None, None]
     assert [client.api_base_url for client in clients] == ["http://worker-a:8000/v1"] * 2
     assert [client.extra_headers["X-data-parallel-rank"] for client in clients] == ["0", "1"]
sys_platform == 'linux')" }, - { name = "requests", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "tiktoken", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "jsonschema", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic-extra-types", extra = ["pycountry"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/61/97/753c85b5c0a19f4331ac99e0300ac8da06d4b29b629c9cb03064b38561bd/mistral_common-1.11.0.tar.gz", hash = "sha256:439b7fa38f9c3f020154af51bdf30eb81def507643017d8ce9f798384ec47ec3", size = 6355512, upload-time = "2026-04-01T13:54:12.36Z" } wheels = [ @@ -1907,7 +1911,34 @@ wheels = [ [package.optional-dependencies] image = [ - { name = "opencv-python-headless", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] + +[[package]] +name = "mistral-common" +version = "1.11.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "jsonschema", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic-extra-types", extra = ["pycountry"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/eb/12167a1bea9714582e5b4f539f9c019323363e314a499c72855ff0e5ad43/mistral_common-1.11.2.tar.gz", hash = "sha256:79f68fc2d1190f28637f40e053f919c8c2697e00b2aa679ddee562a95183f4ad", size = 6357845, upload-time = "2026-05-04T19:47:40.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/f0/6a5d604b972e442b9d36c117d01788feddad099e4965699e3516ee6fefc3/mistral_common-1.11.2-py3-none-any.whl", hash = "sha256:ebb42062cd705a0aa2bc69b4cde2b83d446ae58150b7e29322c90cb08fcfca6c", size 
= 6531968, upload-time = "2026-05-04T19:47:37.718Z" }, +] + +[package.optional-dependencies] +image = [ + { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] [[package]] @@ -1964,20 +1995,44 @@ wheels = [ name = "model-hosting-container-standards" version = "0.1.13" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "fastapi", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "httpx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "jmespath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "starlette", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "supervisor", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "fastapi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "httpx", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "jmespath", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "starlette", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "supervisor", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d7/b7/a6a31b4dfd30d14b1019dc358f09c9d88ca38e555ba7c976e7d3e6b593fe/model_hosting_container_standards-0.1.13.tar.gz", hash = "sha256:27a1333410dde2719286a300a2803e24fdde407baa91894eb845c0f268aa194d", size = 79116, upload-time = "2026-01-09T21:45:20.683Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8c/37/6dc61971ba31450bbed460b5f40543f0915e352680534e3bcaf57116d8d7/model_hosting_container_standards-0.1.13-py3-none-any.whl", hash = "sha256:be307d4a988cc660df4e6bd8bdedb7917844bac940e332f9fd001cb385d7994c", size = 105738, upload-time = "2026-01-09T21:45:18.959Z" }, ] +[[package]] +name = "model-hosting-container-standards" +version = "0.1.15" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "fastapi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "httpx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "jmespath", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = 
"platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "starlette", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "supervisor", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/5a/d669bdeb5ba96db42c6ef010835a25119b05f8c35ee5f1c3f715626625fe/model_hosting_container_standards-0.1.15.tar.gz", hash = "sha256:ae8dd74d3250545c14f0a7068186c7b0f0ab6563d31e7137f556b6b660c8a6a9", size = 93994, upload-time = "2026-05-05T18:22:29.357Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/26/c7aea197f1719f31d0dd686eb4475982fe9efd7668ce259cb52b62c676b6/model_hosting_container_standards-0.1.15-py3-none-any.whl", hash = "sha256:849e08c4732203ee861c8c24966b4e916ea4420fa324b430f7f74a1e1fe8811a", size = 125418, upload-time = "2026-05-05T18:22:27.819Z" }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -2767,6 +2822,7 @@ dependencies = [ { name = "prime", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-rl-configs", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyarrow", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pybase64", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyzmq", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "renderers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "rich", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -2782,8 +2838,8 @@ dependencies = [ { name = "transformers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "uvloop", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "verifiers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "vllm", version = "0.20.2rc1.dev354+g24337fb86.cu129", source = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "vllm", version = "0.20.2+cu129", source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "vllm", version = "0.20.2+cu129", source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "wandb", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and 
sys_platform == 'linux')" }, ] @@ -2890,6 +2946,7 @@ requires-dist = [ { name = "prime-rl", extras = ["quack"], marker = "extra == 'all'" }, { name = "prime-rl-configs", editable = "packages/prime-rl-configs" }, { name = "pyarrow", specifier = ">=21.0.0" }, + { name = "pybase64", specifier = ">=1.4.2" }, { name = "pyzmq", specifier = ">=27.1.0" }, { name = "quack-kernels", marker = "extra == 'quack'", specifier = ">=0.3.3" }, { name = "renderers", specifier = "==0.1.6" }, @@ -2907,11 +2964,11 @@ requires-dist = [ { name = "torchvision", index = "https://download.pytorch.org/whl/cu128" }, { name = "transformers", git = "https://github.com/huggingface/transformers.git?rev=c1c3424" }, { name = "uvloop", specifier = ">=0.21.0" }, - { name = "verifiers", git = "https://github.com/PrimeIntellect-ai/verifiers.git?rev=aa428f3" }, - { name = "vllm", marker = "platform_machine != 'aarch64' and platform_machine != 'x86_64'", specifier = ">=0.20.2" }, + { name = "verifiers", git = "https://github.com/PrimeIntellect-ai/verifiers?rev=3708ede" }, + { name = "vllm", marker = "platform_machine != 'aarch64' and platform_machine != 'x86_64'" }, { name = "vllm", marker = "platform_machine == 'aarch64'", url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" }, - { name = "vllm", marker = "platform_machine == 'x86_64'", url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" }, - { name = "vllm-router", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" }, + { name = "vllm", marker = "platform_machine == 'x86_64'", url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" }, + { name = "vllm-router", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }, { name = "wandb", specifier = ">=0.26.1" }, { name = "wiki-search", marker = "extra == 'envs'", index = "https://hub.primeintellect.ai/primeintellect/simple/" }, ] @@ -3858,6 +3915,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" }, ] +[[package]] +name = "tokenspeed-mla" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenspeed-triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/01/4bf8b74ead3e8e7c1c809435396254c067a33fde48acc20f602aae622d97/tokenspeed_mla-0.1.2-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:c9466a351fe039792e56cf49f3e79744c1dc28c7af10306a02e62b8e92fa5985", size = 748681, upload-time = "2026-05-13T03:30:56.718Z" 
}, +] + +[[package]] +name = "tokenspeed-triton" +version = "3.7.10.post20260505" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/c3/4808d86016368fed9495c3a3408cc7f912e7863ff3432937404bd0a551a6/tokenspeed_triton-3.7.10.post20260505-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19618c7db01a9bd33885f7acbf8945adb2f5534668aa97629b56d481753cbcad", size = 89127692, upload-time = "2026-05-05T07:49:04.22Z" }, +] + [[package]] name = "toml" version = "0.10.2" @@ -4197,8 +4276,8 @@ wheels = [ [[package]] name = "verifiers" -version = "0.1.14" -source = { git = "https://github.com/PrimeIntellect-ai/verifiers.git?rev=aa428f3#aa428f3941ae35a7cf7c0dad7e60c7eca525bac6" } +version = "0.1.15.dev5" +source = { git = "https://github.com/PrimeIntellect-ai/verifiers?rev=3708ede#3708ede31d16b77866befa3c7a97cf94b5062cd3" } dependencies = [ { name = "aiolimiter", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "anthropic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -4215,6 +4294,7 @@ dependencies = [ { name = "openai-agents", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-sandboxes", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "prime-tunnel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "pybase64", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "pyzmq", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "regex", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -4223,7 +4303,6 @@ dependencies = [ { name = "setproctitle", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "tenacity", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "textual", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "wget", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] [[package]] @@ -4242,83 +4321,84 @@ wheels = [ [[package]] name = "vllm" -version = "0.20.2+cu129" -source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" } +version = "0.20.2rc1.dev354+g24337fb86.cu129" +source = { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl" } resolution-markers = [ - 
"platform_machine == 'aarch64' and sys_platform == 'linux'", + "platform_machine == 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = "aiohttp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "anthropic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "apache-tvm-ffi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "blake3", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cachetools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cbor2", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "cloudpickle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "compressed-tensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "depyf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "diskcache", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "einops", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "fastsafetensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "filelock", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "flashinfer-cubin", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "flashinfer-python", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "gguf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "ijson", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "lark", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "lm-format-enforcer", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "mcp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "mistral-common", extra = ["image"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "model-hosting-container-standards", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "msgspec", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "ninja", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "numba", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "openai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "openai-harmony", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-api", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = 
"opentelemetry-exporter-otlp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-sdk", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "outlines-core", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "partial-json-parser", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "prometheus-client", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "protobuf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "psutil", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "py-cpuinfo", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pybase64", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "python-json-logger", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pyyaml", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "pyzmq", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "quack-kernels", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "regex", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "sentencepiece", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "setproctitle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "six", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tilelang", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tokenizers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchaudio", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "tqdm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "watchfiles", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "aiohttp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "anthropic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = 
"apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "blake3", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cachetools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cbor2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "cloudpickle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "compressed-tensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "depyf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "diskcache", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "einops", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "fastsafetensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "flashinfer-cubin", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "flashinfer-python", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "gguf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "ijson", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "lark", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "llguidance", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "lm-format-enforcer", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "mcp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "mistral-common", version = "1.11.2", source = { registry = "https://pypi.org/simple" }, extra = ["image"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "model-hosting-container-standards", version = "0.1.15", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "msgspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "ninja", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numba", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "openai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "openai-harmony", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-api", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-sdk", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "opentelemetry-semantic-conventions-ai", 
marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "outlines-core", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "partial-json-parser", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "prometheus-client", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "protobuf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "psutil", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "py-cpuinfo", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pybase64", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "python-json-logger", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pyyaml", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pyzmq", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "quack-kernels", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "regex", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "sentencepiece", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setproctitle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "six", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tilelang", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenizers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tokenspeed-mla", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchaudio", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torchvision", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "tqdm", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "watchfiles", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "xgrammar", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:8a58a086c5c4ed2883eee36aaaf6b79c83463d02da3015454acf92afcc8e150e" }, + { url = 
"https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.20.2rc1.dev354+g24337fb86.cu129-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a16f4fd2d468f0bb0afd84e3e96f4016654e8525892879909f7a095e33101668" }, ] [package.metadata] @@ -4352,14 +4432,14 @@ requires-dist = [ { name = "matplotlib", marker = "extra == 'bench'" }, { name = "mcp" }, { name = "mistral-common", extras = ["audio"], marker = "extra == 'audio'" }, - { name = "mistral-common", extras = ["image"], specifier = ">=1.11.0" }, - { name = "model-hosting-container-standards", specifier = ">=0.1.13,<1.0.0" }, + { name = "mistral-common", extras = ["image"], specifier = ">=1.11.2" }, + { name = "model-hosting-container-standards", specifier = ">=0.1.14,<1.0.0" }, { name = "msgspec" }, { name = "ninja" }, { name = "numba", specifier = "==0.65.0" }, { name = "numpy" }, { name = "nvidia-cudnn-frontend", specifier = ">=1.13.0,<1.19.0" }, - { name = "nvidia-cutlass-dsl", specifier = ">=4.4.2" }, + { name = "nvidia-cutlass-dsl", specifier = "==4.5.0" }, { name = "openai", specifier = ">=2.0.0" }, { name = "openai-harmony", specifier = ">=0.0.3" }, { name = "opencv-python-headless", specifier = ">=4.13.0" }, @@ -4403,6 +4483,7 @@ requires-dist = [ { name = "tiktoken", specifier = ">=0.6.0" }, { name = "tilelang", specifier = "==0.1.9" }, { name = "tokenizers", specifier = ">=0.21.1" }, + { name = "tokenspeed-mla", specifier = "==0.1.2" }, { name = "torch", specifier = "==2.11.0" }, { name = "torchaudio", specifier = "==2.11.0" }, { name = "torchvision", specifier = "==0.26.0" }, @@ -4410,7 +4491,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.56.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*,!=5.4.*,!=5.5.0" }, { name = "typing-extensions", specifier = ">=4.10" }, { name = "watchfiles" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'", specifier = ">=0.1.32,<1.0.0" }, + { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'", specifier = ">=0.2.0,<1.0.0" }, { name = "zentorch-weekly", marker = "extra == 'zen'", specifier = "==5.2.1.dev20260408" }, ] provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttensor", "runai", "audio", "video", "flashinfer", "helion", "grpc", "otel"] @@ -4418,82 +4499,82 @@ provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttens [[package]] name = "vllm" version = "0.20.2+cu129" -source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl" } +source = { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl" } resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", ] dependencies = [ - { name = "aiohttp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "anthropic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "blake3", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "cachetools", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, - { name = "cbor2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "cloudpickle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "compressed-tensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "depyf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "diskcache", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "einops", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "fastsafetensors", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "filelock", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "flashinfer-cubin", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "flashinfer-python", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "gguf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "ijson", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "lark", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "llguidance", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "lm-format-enforcer", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "mcp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "mistral-common", extra = ["image"], marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "model-hosting-container-standards", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "msgspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "ninja", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "numba", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "openai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "openai-harmony", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opencv-python-headless", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-api", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-sdk", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "outlines-core", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "partial-json-parser", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pillow", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = 
"prometheus-client", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "protobuf", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "psutil", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "py-cpuinfo", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pybase64", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "python-json-logger", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pyyaml", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "pyzmq", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "quack-kernels", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "regex", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "requests", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "sentencepiece", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setproctitle", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "six", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tiktoken", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tilelang", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tokenizers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchaudio", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "torchvision", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "tqdm", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "watchfiles", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "xgrammar", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "aiohttp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "anthropic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "apache-tvm-ffi", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "blake3", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cachetools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cbor2", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "cloudpickle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "compressed-tensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "depyf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { 
name = "diskcache", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "einops", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "fastapi", extra = ["standard"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "fastsafetensors", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "filelock", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "flashinfer-cubin", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "flashinfer-python", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "gguf", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "ijson", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "lark", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "llguidance", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "lm-format-enforcer", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "mcp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "mistral-common", version = "1.11.0", source = { registry = "https://pypi.org/simple" }, extra = ["image"], marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "model-hosting-container-standards", version = "0.1.13", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "msgspec", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "ninja", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numba", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-frontend", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "openai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "openai-harmony", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opencv-python-headless", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-api", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-exporter-otlp", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-sdk", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "opentelemetry-semantic-conventions-ai", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "outlines-core", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "partial-json-parser", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "prometheus-client", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "prometheus-fastapi-instrumentator", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "protobuf", marker = 
"platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "psutil", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "py-cpuinfo", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pybase64", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "python-json-logger", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pyyaml", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pyzmq", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "quack-kernels", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "regex", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "sentencepiece", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setproctitle", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "six", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tiktoken", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tilelang", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tokenizers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "tqdm", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "watchfiles", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "xgrammar", version = "0.1.33", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:2f8c2bf2ac6d3d16f930535e66822abd71065468521884eb5b910225b2abef4b" }, + { url = "https://github.com/vllm-project/vllm/releases/download/v0.20.2/vllm-0.20.2+cu129-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:8a58a086c5c4ed2883eee36aaaf6b79c83463d02da3015454acf92afcc8e150e" }, ] [package.metadata] @@ -4592,8 +4673,8 @@ provides-extras = ["zen", "bench", "tensorizer", "fastsafetensors", "instanttens [[package]] name = "vllm-router" -version = "0.1.22" -source = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl" } +version = "0.1.25" +source = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" } dependencies = [ { name = "aiohttp", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, { name = "fastapi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -4603,7 +4684,7 @@ dependencies = [ { name = "uvicorn", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.22/vllm_router-0.1.22-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6361a0387241e56932f3ba2e51af27f58d11a462e3187e58286b2f96056e4d15" }, + { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:e84e731a0779f820bfe3cf4ce78cea2d09993c0a6501c63bcda93826bcd21fd0" }, ] [package.metadata] @@ -4711,12 +4792,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" }, ] -[[package]] -name = "wget" -version = "3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857, upload-time = "2015-10-22T15:26:37.51Z" } - [[package]] name = "widgetsnbextension" version = "4.0.14" @@ -4744,18 +4819,40 @@ wheels = [ name = "xgrammar" version = "0.1.33" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "numpy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "pydantic", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "transformers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/db/43/e5dfddb1d2a4fccf3e3a88f103e88698cdefc3182f4e169a359ffe1c1794/xgrammar-0.1.33.tar.gz", hash = "sha256:8dbe5fc3d76651ab1fac7a68fc2a118b885fa0ec7189927fb6e0dce0081aea99", size = 2398956, upload-time = "2026-03-27T10:16:36.582Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/4e/04/43d4baca876f5ae1b45897ec30a59801a2da37f16da1fcd85f9555e4c125/xgrammar-0.1.33-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c803e60d791854c5d1f271ece7e1f34d73c82dd4a8b2a06b7af5331482a78ac", size = 42133168, upload-time = "2026-03-27T10:15:16.994Z" }, - { url = "https://files.pythonhosted.org/packages/f0/a8/672833a3cff027253793aa999401d8364896ebf396967e475c7a878b895f/xgrammar-0.1.33-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:52b8eaa533282a0efb0835db6998ae72e7b3c7875d7a52e360ffebff9b78c30a", size = 42205803, upload-time = "2026-03-27T10:15:21.599Z" }, +] + +[[package]] +name = "xgrammar" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "apache-tvm-ffi", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "numpy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "pydantic", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "transformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a0/54/7e593fc41ffcaf5ac7c0379e0aec0cf03e53a742d1a91f64c6c7e79a6ac1/xgrammar-0.2.0.tar.gz", hash = "sha256:c4f0238a89869343171d43d069b8c5da874f3c2c25f408f20cd5987219a6adef", size = 2421093, upload-time = "2026-05-01T18:33:54.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/30/99f4e83821db16d58dd41249ba46038ed47bce274c57ad5567030775fc62/xgrammar-0.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a36c744d24d93e178c138486aa02b390a80326b64ff11e222e063a028dd65849", size = 44616361, upload-time = "2026-05-01T18:32:42.536Z" }, ] [[package]]