2 changes: 2 additions & 0 deletions configs/ci/nightly/multimodal_color_codeword.toml
@@ -16,6 +16,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 256
rollouts_per_example = 16
use_token_client = false
use_renderer = true

[orchestrator.train.sampling]
max_completion_tokens = 64
2 changes: 2 additions & 0 deletions configs/multimodal/rl_color_codeword.toml
@@ -11,6 +11,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 256
rollouts_per_example = 16
use_token_client = false
use_renderer = true


[orchestrator.train.sampling]
100 changes: 100 additions & 0 deletions configs/multimodal/rl_color_codeword_feat_renderer.toml
@@ -0,0 +1,100 @@
# 20-step Qwen3-VL-4B RL run on color-codeword using the renderer multimodal path.
#
# Pair with rl_color_codeword_main_mito.toml for an A/B comparison: same env,
# same hyperparameters, same step count — only difference is the inference
# client. The feat-branch run uses the new RendererClient + Qwen3VLRenderer
# (renderers package with multimodal support); the main-branch baseline uses
# the existing MITO chat-completions path through the inference server.
#
# Compare in W&B project ``multimodal-renderer``:
# - ``kl/sampler_vs_trainer`` should be ~0 on this branch (the renderer
# produces byte-identical tokens to what the trainer re-tokenizes) and
# can spike on main when BPE drifts mid-rollout.
# - ``reward`` and ``loss`` should track within noise — same model, same
# env, same hyperparameters.
# - ``bridge_break_rate`` is renderer-only; surfaces multi-turn extension
# failures.

max_steps = 20
seq_len = 4096
output_dir = "outputs/rl_color_codeword_feat_renderer"
clean_output_dir = true
# Pure on-policy: inference can't run ahead of training, so every rollout
# is generated from the latest policy weights. Removes async/off-policy
# drift as a confound for the sampler-vs-trainer KL.
max_async_level = 0

[model]
name = "Qwen/Qwen3-VL-4B-Instruct"

[model.vlm]
vision_encoder_attr = "model.visual"
language_model_attr = "model.language_model"

[deployment]
num_train_gpus = 1
num_infer_gpus = 1
gpus_per_node = 2

[orchestrator]
batch_size = 16
rollouts_per_example = 8
use_renderer = true
use_token_client = false

# Track zero-advantage groups but don't drop them — we're validating the
# multimodal renderer path on 20 steps, not optimizing training efficiency.
# Step 0 on Qwen3-VL-4B vs color-codeword is likely uniform (all-correct or
# all-wrong) so enforce=True would crash before any training happens.
[[orchestrator.filters]]
type = "gibberish"

[[orchestrator.filters]]
type = "repetition"

[[orchestrator.filters]]
type = "zero_advantage"
enforce = false

[orchestrator.train.sampling]
max_completion_tokens = 64

[[orchestrator.train.env]]
id = "color-codeword"
args = { images_per_turn = 2, max_turns = 2, num_examples = 100, seed = 42 }

[orchestrator.renderer]
name = "auto"
# 128 concurrent rollouts (batch_size=16 × rollouts_per_example=8) need
# more than one tokenizer slot to avoid queueing behind a single slot. The
# image processor (CPU-bound) dominates for VLMs, so returns diminish
# past 4; bump to 4 as the default for multimodal runs.
pool_size = 4

[trainer]

[trainer.model]
optimization_dtype = "bfloat16"
reduce_dtype = "bfloat16"

[trainer.optim]
lr = 3e-6

[inference]

[inference.model]
# Workaround for vLLM 0.20.1 Qwen3-VL deepstack buffer bug: when num_scheduled_tokens
# (188) gets padded up to the next cudagraph_capture_size (192), the model's
# _set_deepstack_input_embeds sizes the buffer to 188 but forward() runs with 192,
# triggering "Requested more deepstack tokens than available in buffer". Eager mode
# skips the padding so num_input_tokens == num_scheduled_tokens.
enforce_eager = true

[inference.parallel]
dp = 1
tp = 1

[wandb]
project = "multimodal-renderer"
name = "feat-renderer-20step-r8-i2-t2-onpolicy"
tags = ["qwen3vl-4b", "color-codeword", "renderer", "feat-branch", "mm-kwargs-generic", "on-policy"]
2 changes: 2 additions & 0 deletions configs/multimodal/rl_color_codeword_test.toml
@@ -12,6 +12,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 16
rollouts_per_example = 2
use_token_client = false
use_renderer = true

[orchestrator.train.sampling]
max_completion_tokens = 32
36 changes: 22 additions & 14 deletions packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
@@ -1230,20 +1230,6 @@ def validate_client_mode(self):
)
return self

@model_validator(mode="after")
def validate_renderer_vs_vlm(self):
"""The renderer client takes plain message dicts and tokenizes
them client-side. VLMs need server-side image preprocessing and
chat templating, so they must use the token client (TITO) — fail
loudly when both are set."""
if self.use_renderer and self.model.vlm is not None:
raise ValueError(
"orchestrator.use_renderer is not supported for VLMs. Use the token client "
"(``use_token_client=true``, the default) so image preprocessing and chat "
"templating stay on the inference server."
)
return self

@model_validator(mode="after")
def validate_renderer_args(self):
"""``[orchestrator.renderer]`` knobs are only meaningful when
@@ -1269,6 +1255,28 @@ def validate_renderer_args(self):
)
return self

@model_validator(mode="after")
def vlm_requires_renderer(self):
"""VLMs (``[model.vlm]`` block set) must go through the renderer.

The MITO path for VLMs (chat-completions + server-side image
stripping + orchestrator-side AutoProcessor + VLMImageCache) was
removed: it duplicated processor work, hardcoded a Qwen-VL
tensor schema, and produced a token stream the trainer could
only reconstruct because the orchestrator re-tokenized through
the same processor. The renderer path owns the processor
per-slot, produces byte-identical tokens, and ships generic
``mm_kwargs`` keyed by whatever the model's forward signature
expects.
"""
if self.model.vlm is not None and not self.use_renderer:
raise ValueError(
"orchestrator.use_renderer must be true when model.vlm is set. "
"The MITO path for VLMs has been removed; VLMs must go through "
"a renderer (e.g. Qwen3VLRenderer) that owns the processor."
)
return self
Breaking VLM config change missing from CHANGELOG

Medium Severity

The new vlm_requires_renderer validator makes orchestrator.use_renderer = true mandatory when model.vlm is set. Previously, VLM configs used use_renderer = false (the default). Any existing VLM config will now fail at load time with a ValueError. This is a breaking configuration change (effectively a removed valid-config combination) that requires a CHANGELOG.md entry per project rules.


Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit b385fbf.


@model_validator(mode="after")
def nccl_max_async_level(self):
if self.weight_broadcast.type == "nccl":
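Bugbot's point above: the new validator turns a previously valid combination ([model.vlm] set, use_renderer left at its false default) into a load-time error. A self-contained sketch of that after-validator pattern, with the config classes trimmed to the fields involved; field names mirror the TOML keys, everything else is simplified:

from pydantic import BaseModel, model_validator


class VLMConfig(BaseModel):
    vision_encoder_attr: str
    language_model_attr: str


class ModelConfig(BaseModel):
    name: str
    vlm: VLMConfig | None = None


class OrchestratorConfig(BaseModel):
    model: ModelConfig
    use_renderer: bool = False
    use_token_client: bool = True

    @model_validator(mode="after")
    def vlm_requires_renderer(self) -> "OrchestratorConfig":
        # Fail at config-load time: the MITO path for VLMs no longer exists.
        if self.model.vlm is not None and not self.use_renderer:
            raise ValueError("orchestrator.use_renderer must be true when model.vlm is set.")
        return self


vlm = VLMConfig(vision_encoder_attr="model.visual", language_model_attr="model.language_model")
OrchestratorConfig(model=ModelConfig(name="Qwen/Qwen3-VL-4B-Instruct", vlm=vlm), use_renderer=True)  # loads
# Omitting use_renderer (or setting it false) with the same model now raises a ValidationError,
# which is the breaking change a CHANGELOG entry should call out.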
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
"uvloop>=0.21.0",
"torchtitan",
"verifiers",
"renderers==0.1.6",
"renderers>=0.1.7",
"dion",
"tilelang>=0.1.8",
"flash-linear-attention",
@@ -166,7 +166,7 @@ prime-rl-configs = { workspace = true }
torch = { index = "pytorch-cu128" }
torchvision = { index = "pytorch-cu128" }
torchaudio = { index = "pytorch-cu128" }
verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", rev = "aa428f3" }
verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", branch = "feat/renderer-multimodal-passthrough" }
torchtitan = { git = "https://github.com/pytorch/torchtitan", rev = "a1fdd7e" }
dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
105 changes: 38 additions & 67 deletions src/prime_rl/orchestrator/orchestrator.py
@@ -12,7 +12,6 @@
from prime_rl.orchestrator.inference_metrics import InferenceMetricsCollector
from prime_rl.orchestrator.patches import monkey_patch_chat_completion_logprobs, monkey_patch_oai_iterable_types
from prime_rl.orchestrator.trajectories import (
build_vlm_image_cache,
interleave_rollout,
offload_images_to_disk,
pretokenize_rollout_trajectory,
@@ -33,7 +32,6 @@
import pandas as pd
import verifiers as vf
from renderers.base import create_renderer
from transformers import AutoProcessor

from prime_rl.configs.orchestrator import OrchestratorConfig
from prime_rl.orchestrator.buffer import Buffer
@@ -134,20 +132,9 @@ async def orchestrate(config: OrchestratorConfig):
else:
teacher_inference_pool = None

# Check if this is a vision-language model (used throughout for VLM-specific paths)
is_vlm = config.model.vlm is not None

# Load tokenizer and processor (processor only for VLM models)
logger.info(f"Initializing tokenizer ({config.tokenizer})")
tokenizer = setup_tokenizer(config.tokenizer)

processor = None
if is_vlm:
logger.info(f"Loading VLM processor for {config.model.name}")
processor = AutoProcessor.from_pretrained(
config.model.name, trust_remote_code=config.model.trust_remote_code, use_fast=True
)

renderer, inference_pool = await setup_rollout_inference_pool(
config=config,
rollout_client_config=rollout_client_config,
@@ -156,6 +143,18 @@
logger=logger,
)

# Token-id → modality marker (1 = image patch, 2 = video patch) used
# to build ``mm_token_type_ids`` per sample. The renderer is the
# single source of truth — it already knows its own special-token
# IDs (``<|image_pad|>`` etc.) from the tokenizer it owns, so the
# orchestrator never needs to load a separate ``AutoProcessor``.
# Text-only renderers expose an empty map (or no attribute).
mm_token_type_ids_mapping: dict[int, int] | None = (
getattr(renderer, "mm_token_type_id_map", None) if renderer is not None else None
)
if mm_token_type_ids_mapping == {}:
mm_token_type_ids_mapping = None

# Setup monitor (may register the run and set RUN_ID in the environment)
logger.info(f"Initializing monitor (wandb={config.wandb}, prime_monitor={config.prime_monitor})")
monitor = setup_monitor(
@@ -470,69 +469,41 @@
save_rollouts, train_rollouts, step_path / "train_rollouts.jsonl", exclude_keys={"trajectory"}
)

# VLM: offload base64 images to disk immediately to free memory
if is_vlm:
offload_start = time.perf_counter()
num_offloaded = offload_images_to_disk(train_rollouts, config.output_dir)
if num_offloaded:
logger.info(
f"VLM offloaded {num_offloaded} unique images to disk in {time.perf_counter() - offload_start:.2f}s"
)
# Offload base64 images to disk to free memory. No-op for text-only
# rollouts (no ``data:image`` URLs to find); cheap to call always.
offload_start = time.perf_counter()
num_offloaded = offload_images_to_disk(train_rollouts, config.output_dir)
if num_offloaded:
logger.info(
f"Offloaded {num_offloaded} unique images to disk in {time.perf_counter() - offload_start:.2f}s"
)

# Convert rollouts to training samples
parallel_preprocess_start = time.perf_counter()

# Stage 1: pretokenize + (for VLM) build image cache concurrently.
# Pretokenize is a no-op when the renderer client already populated
# `tokens` on each trajectory step, but the fallback-tokenizer path
# and image-cache build are both CPU-heavy. Running them on threads
# and awaiting a single gather lets whichever finishes first free
# the event loop immediately and, with max_async_level >= 2, overlaps
# this whole stage with inference for the next batch.
async def _pretokenize_all() -> None:
await asyncio.gather(
*(
asyncio.to_thread(
pretokenize_rollout_trajectory,
rollout,
tokenizer,
processor=processor,
renderer=renderer,
)
for rollout in train_rollouts
# ``tokens`` on each trajectory step (renderer path); the fallback
# tokenizer-only branch handles text-only rollouts whose tokens
# were not pre-rendered. Run on threads so CPU work overlaps with
# inference for the next batch (via max_async_level >= 2).
await asyncio.gather(
*(
asyncio.to_thread(
pretokenize_rollout_trajectory,
rollout,
tokenizer,
renderer=renderer,
)
for rollout in train_rollouts
)

if is_vlm:
mm_token_type_ids_mapping = {}
if hasattr(processor, "image_token_id") and processor.image_token_id is not None:
mm_token_type_ids_mapping[processor.image_token_id] = 1
if hasattr(processor, "video_token_id") and processor.video_token_id is not None:
mm_token_type_ids_mapping[processor.video_token_id] = 2
_, vlm_cache = await asyncio.gather(
_pretokenize_all(),
asyncio.to_thread(build_vlm_image_cache, train_rollouts, processor),
)
logger.info(
f"VLM timing: extract={vlm_cache.extract_time:.2f}s, preprocess={vlm_cache.preprocess_time:.2f}s "
f"({vlm_cache.num_unique_images} unique images from {vlm_cache.num_unique_examples} examples)"
)
else:
await _pretokenize_all()
vlm_cache = None
mm_token_type_ids_mapping = None
)

# Process rollouts in parallel
def process_rollout(rollout: vf.RolloutOutput, rollout_idx: int) -> list[TrainingSample] | None:
return interleave_rollout(
rollout,
vlm_cache=vlm_cache,
cache_key=rollout_idx,
mm_token_type_ids_mapping=mm_token_type_ids_mapping,
)

results = await asyncio.gather(
*(asyncio.to_thread(process_rollout, r, rollout_idx) for rollout_idx, r in enumerate(train_rollouts))
*(
asyncio.to_thread(interleave_rollout, r, mm_token_type_ids_mapping=mm_token_type_ids_mapping)
for r in train_rollouts
)
)

# Collect results and assign advantages. Metrics are computed over all
@@ -794,7 +765,7 @@ def compute_solve_rates(df):
is_first_step = False

# Free large per-step objects to prevent memory accumulation
del train_rollouts, train_examples, training_batch, vlm_cache
del train_rollouts, train_examples, training_batch
del results_df, metrics_df
gc.collect()

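The mm_token_type_id_map comment above describes a token-id → modality map supplied by the renderer (1 = image patch, 2 = video patch). A minimal sketch of how such a map can be turned into per-sample mm_token_type_ids; the helper name, the 0-for-text fallback, and the example token id are assumptions, only the 1/2 image/video markers come from the comment:

def build_mm_token_type_ids(
    token_ids: list[int],
    mm_token_type_id_map: dict[int, int] | None,
) -> list[int] | None:
    """Mark each token as text (0), image patch (1), or video patch (2)."""
    if not mm_token_type_id_map:
        # Text-only renderer: an empty map (or a missing attribute) means nothing to mark.
        return None
    return [mm_token_type_id_map.get(token_id, 0) for token_id in token_ids]


# Hypothetical image-pad token id; the real id comes from the renderer's own tokenizer.
print(build_mm_token_type_ids([10, 151655, 151655, 11], {151655: 1}))  # -> [0, 1, 1, 0]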