2 changes: 2 additions & 0 deletions configs/ci/nightly/multimodal_color_codeword.toml
@@ -16,6 +16,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 256
rollouts_per_example = 16
use_token_client = false
use_renderer = true

[orchestrator.train.sampling]
max_completion_tokens = 64
2 changes: 2 additions & 0 deletions configs/multimodal/rl_color_codeword.toml
@@ -11,6 +11,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 256
rollouts_per_example = 16
use_token_client = false
use_renderer = true


[orchestrator.train.sampling]
100 changes: 100 additions & 0 deletions configs/multimodal/rl_color_codeword_feat_renderer.toml
@@ -0,0 +1,100 @@
# 20-step Qwen3-VL-4B RL run on color-codeword using the renderer multimodal path.
#
# Pair with rl_color_codeword_main_mito.toml for an A/B comparison: same env,
# same hyperparameters, same step count — only difference is the inference
# client. The feat-branch run uses the new RendererClient + Qwen3VLRenderer
# (renderers package with multimodal support); the main-branch baseline uses
# the existing MITO chat-completions path through the inference server.
#
# Compare in W&B project ``multimodal-renderer``:
# - ``kl/sampler_vs_trainer`` should be ~0 on this branch (the renderer
# produces byte-identical tokens to what the trainer re-tokenizes) and
# can spike on main when BPE drifts mid-rollout.
# - ``reward`` and ``loss`` should track within noise — same model, same
# env, same hyperparameters.
# - ``bridge_break_rate`` is renderer-only; surfaces multi-turn extension
# failures.

max_steps = 20
seq_len = 4096
output_dir = "outputs/rl_color_codeword_feat_renderer"
clean_output_dir = true
# Pure on-policy: inference can't run ahead of training, so every rollout
# is generated from the latest policy weights. Removes async/off-policy
# drift as a confound for the sampler-vs-trainer KL.
max_async_level = 0

[model]
name = "Qwen/Qwen3-VL-4B-Instruct"

[model.vlm]
vision_encoder_attr = "model.visual"
language_model_attr = "model.language_model"

[deployment]
num_train_gpus = 1
num_infer_gpus = 1
gpus_per_node = 2

[orchestrator]
batch_size = 16
rollouts_per_example = 8
use_renderer = true
use_token_client = false

# Track zero-advantage groups but don't drop them — we're validating the
# multimodal renderer path on 20 steps, not optimizing training efficiency.
# Step 0 on Qwen3-VL-4B vs color-codeword is likely uniform (all-correct or
# all-wrong) so enforce=True would crash before any training happens.
[[orchestrator.filters]]
type = "gibberish"

[[orchestrator.filters]]
type = "repetition"

[[orchestrator.filters]]
type = "zero_advantage"
enforce = false

[orchestrator.train.sampling]
max_completion_tokens = 64

[[orchestrator.train.env]]
id = "color-codeword"
args = { images_per_turn = 2, max_turns = 2, num_examples = 100, seed = 42 }

[orchestrator.renderer]
name = "auto"
# 128 concurrent rollouts (batch_size=16 × rollouts_per_example=8) need
# more than one tokenizer slot to avoid queueing behind a single slot. The
# image processor (CPU-bound) dominates for VLMs, so returns diminish
# past 4; bump to 4 as the default for multimodal runs.
pool_size = 4

[trainer]

[trainer.model]
optimization_dtype = "bfloat16"
reduce_dtype = "bfloat16"

[trainer.optim]
lr = 3e-6

[inference]

[inference.model]
# Workaround for vLLM 0.20.1 Qwen3-VL deepstack buffer bug: when num_scheduled_tokens
# (188) gets padded up to the next cudagraph_capture_size (192), the model's
# _set_deepstack_input_embeds sizes the buffer to 188 but forward() runs with 192,
# triggering "Requested more deepstack tokens than available in buffer". Eager mode
# skips the padding so num_input_tokens == num_scheduled_tokens.
enforce_eager = true

[inference.parallel]
dp = 1
tp = 1

[wandb]
project = "multimodal-renderer"
name = "feat-renderer-20step-r8-i2-t2-onpolicy"
tags = ["qwen3vl-4b", "color-codeword", "renderer", "feat-branch", "mm-kwargs-generic", "on-policy"]
2 changes: 2 additions & 0 deletions configs/multimodal/rl_color_codeword_test.toml
@@ -12,6 +12,8 @@ language_model_attr = "model.language_model"
[orchestrator]
batch_size = 16
rollouts_per_example = 2
use_token_client = false
use_renderer = true

[orchestrator.train.sampling]
max_completion_tokens = 32
36 changes: 22 additions & 14 deletions packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
@@ -1230,20 +1230,6 @@ def validate_client_mode(self):
)
return self

@model_validator(mode="after")
def validate_renderer_vs_vlm(self):
"""The renderer client takes plain message dicts and tokenizes
them client-side. VLMs need server-side image preprocessing and
chat templating, so they must use the token client (TITO) — fail
loudly when both are set."""
if self.use_renderer and self.model.vlm is not None:
raise ValueError(
"orchestrator.use_renderer is not supported for VLMs. Use the token client "
"(``use_token_client=true``, the default) so image preprocessing and chat "
"templating stay on the inference server."
)
return self

@model_validator(mode="after")
def validate_renderer_args(self):
"""``[orchestrator.renderer]`` knobs are only meaningful when
@@ -1269,6 +1255,28 @@ def validate_renderer_args(self):
)
return self

@model_validator(mode="after")
def vlm_requires_renderer(self):
"""VLMs (``[model.vlm]`` block set) must go through the renderer.

The MITO path for VLMs (chat-completions + server-side image
stripping + orchestrator-side AutoProcessor + VLMImageCache) was
removed: it duplicated processor work, hardcoded a Qwen-VL
tensor schema, and produced a token stream the trainer could
only reconstruct because the orchestrator re-tokenized through
the same processor. The renderer path owns the processor
per-slot, produces byte-identical tokens, and ships generic
``mm_kwargs`` keyed by whatever the model's forward signature
expects.
"""
if self.model.vlm is not None and not self.use_renderer:
raise ValueError(
"orchestrator.use_renderer must be true when model.vlm is set. "
"The MITO path for VLMs has been removed; VLMs must go through "
"a renderer (e.g. Qwen3VLRenderer) that owns the processor."
)
return self
Breaking VLM config change missing from CHANGELOG

Medium Severity

The new vlm_requires_renderer validator makes orchestrator.use_renderer = true mandatory when model.vlm is set. Previously, VLM configs used use_renderer = false (the default). Any existing VLM config will now fail at load time with a ValueError. This is a breaking configuration change (effectively a removed valid-config combination) that requires a CHANGELOG.md entry per project rules.


Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit b385fbf.


@model_validator(mode="after")
def nccl_max_async_level(self):
if self.weight_broadcast.type == "nccl":
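Bugbot's point above: the new validator turns a previously valid combination ([model.vlm] set, use_renderer left at its false default) into a load-time error. A self-contained sketch of that after-validator pattern, with the config classes trimmed to the fields involved; field names mirror the TOML keys, everything else is simplified:

from pydantic import BaseModel, model_validator


class VLMConfig(BaseModel):
    vision_encoder_attr: str
    language_model_attr: str


class ModelConfig(BaseModel):
    name: str
    vlm: VLMConfig | None = None


class OrchestratorConfig(BaseModel):
    model: ModelConfig
    use_renderer: bool = False
    use_token_client: bool = True

    @model_validator(mode="after")
    def vlm_requires_renderer(self) -> "OrchestratorConfig":
        # Fail at config-load time: the MITO path for VLMs no longer exists.
        if self.model.vlm is not None and not self.use_renderer:
            raise ValueError("orchestrator.use_renderer must be true when model.vlm is set.")
        return self


vlm = VLMConfig(vision_encoder_attr="model.visual", language_model_attr="model.language_model")
OrchestratorConfig(model=ModelConfig(name="Qwen/Qwen3-VL-4B-Instruct", vlm=vlm), use_renderer=True)  # loads
# Omitting use_renderer (or setting it false) with the same model now raises a ValidationError,
# which is the breaking change a CHANGELOG entry should call out.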
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
"uvloop>=0.21.0",
"torchtitan",
"verifiers",
"renderers==0.1.6",
"renderers>=0.1.7",
"dion",
"tilelang>=0.1.8",
"flash-linear-attention",
@@ -166,7 +166,7 @@ prime-rl-configs = { workspace = true }
torch = { index = "pytorch-cu128" }
torchvision = { index = "pytorch-cu128" }
torchaudio = { index = "pytorch-cu128" }
verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", rev = "aa428f3" }
verifiers = { git = "https://github.com/PrimeIntellect-ai/verifiers.git", branch = "feat/renderer-multimodal-passthrough" }
torchtitan = { git = "https://github.com/pytorch/torchtitan", rev = "a1fdd7e" }
dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
105 changes: 38 additions & 67 deletions src/prime_rl/orchestrator/orchestrator.py
@@ -12,7 +12,6 @@
from prime_rl.orchestrator.inference_metrics import InferenceMetricsCollector
from prime_rl.orchestrator.patches import monkey_patch_chat_completion_logprobs, monkey_patch_oai_iterable_types
from prime_rl.orchestrator.trajectories import (
build_vlm_image_cache,
interleave_rollout,
offload_images_to_disk,
pretokenize_rollout_trajectory,
@@ -33,7 +32,6 @@
import pandas as pd
import verifiers as vf
from renderers.base import create_renderer
from transformers import AutoProcessor

from prime_rl.configs.orchestrator import OrchestratorConfig
from prime_rl.orchestrator.buffer import Buffer
@@ -134,20 +132,9 @@ async def orchestrate(config: OrchestratorConfig):
else:
teacher_inference_pool = None

# Check if this is a vision-language model (used throughout for VLM-specific paths)
is_vlm = config.model.vlm is not None

# Load tokenizer and processor (processor only for VLM models)
logger.info(f"Initializing tokenizer ({config.tokenizer})")
tokenizer = setup_tokenizer(config.tokenizer)

processor = None
if is_vlm:
logger.info(f"Loading VLM processor for {config.model.name}")
processor = AutoProcessor.from_pretrained(
config.model.name, trust_remote_code=config.model.trust_remote_code, use_fast=True
)

renderer, inference_pool = await setup_rollout_inference_pool(
config=config,
rollout_client_config=rollout_client_config,
@@ -156,6 +143,18 @@
logger=logger,
)

# Token-id → modality marker (1 = image patch, 2 = video patch) used
# to build ``mm_token_type_ids`` per sample. The renderer is the
# single source of truth — it already knows its own special-token
# IDs (``<|image_pad|>`` etc.) from the tokenizer it owns, so the
# orchestrator never needs to load a separate ``AutoProcessor``.
# Text-only renderers expose an empty map (or no attribute).
mm_token_type_ids_mapping: dict[int, int] | None = (
getattr(renderer, "mm_token_type_id_map", None) if renderer is not None else None
)
if mm_token_type_ids_mapping == {}:
mm_token_type_ids_mapping = None

# Setup monitor (may register the run and set RUN_ID in the environment)
logger.info(f"Initializing monitor (wandb={config.wandb}, prime_monitor={config.prime_monitor})")
monitor = setup_monitor(
@@ -470,69 +469,41 @@
save_rollouts, train_rollouts, step_path / "train_rollouts.jsonl", exclude_keys={"trajectory"}
)

# VLM: offload base64 images to disk immediately to free memory
if is_vlm:
offload_start = time.perf_counter()
num_offloaded = offload_images_to_disk(train_rollouts, config.output_dir)
if num_offloaded:
logger.info(
f"VLM offloaded {num_offloaded} unique images to disk in {time.perf_counter() - offload_start:.2f}s"
)
# Offload base64 images to disk to free memory. No-op for text-only
# rollouts (no ``data:image`` URLs to find); cheap to call always.
offload_start = time.perf_counter()
num_offloaded = offload_images_to_disk(train_rollouts, config.output_dir)
if num_offloaded:
logger.info(
f"Offloaded {num_offloaded} unique images to disk in {time.perf_counter() - offload_start:.2f}s"
)

# Convert rollouts to training samples
parallel_preprocess_start = time.perf_counter()

# Stage 1: pretokenize + (for VLM) build image cache concurrently.
# Pretokenize is a no-op when the renderer client already populated
# `tokens` on each trajectory step, but the fallback-tokenizer path
# and image-cache build are both CPU-heavy. Running them on threads
# and awaiting a single gather lets whichever finishes first free
# the event loop immediately and, with max_async_level >= 2, overlaps
# this whole stage with inference for the next batch.
async def _pretokenize_all() -> None:
await asyncio.gather(
*(
asyncio.to_thread(
pretokenize_rollout_trajectory,
rollout,
tokenizer,
processor=processor,
renderer=renderer,
)
for rollout in train_rollouts
# ``tokens`` on each trajectory step (renderer path); the fallback
# tokenizer-only branch handles text-only rollouts whose tokens
# were not pre-rendered. Run on threads so CPU work overlaps with
# inference for the next batch (via max_async_level >= 2).
await asyncio.gather(
*(
asyncio.to_thread(
pretokenize_rollout_trajectory,
rollout,
tokenizer,
renderer=renderer,
)
for rollout in train_rollouts
)

if is_vlm:
mm_token_type_ids_mapping = {}
if hasattr(processor, "image_token_id") and processor.image_token_id is not None:
mm_token_type_ids_mapping[processor.image_token_id] = 1
if hasattr(processor, "video_token_id") and processor.video_token_id is not None:
mm_token_type_ids_mapping[processor.video_token_id] = 2
_, vlm_cache = await asyncio.gather(
_pretokenize_all(),
asyncio.to_thread(build_vlm_image_cache, train_rollouts, processor),
)
logger.info(
f"VLM timing: extract={vlm_cache.extract_time:.2f}s, preprocess={vlm_cache.preprocess_time:.2f}s "
f"({vlm_cache.num_unique_images} unique images from {vlm_cache.num_unique_examples} examples)"
)
else:
await _pretokenize_all()
vlm_cache = None
mm_token_type_ids_mapping = None
)

# Process rollouts in parallel
def process_rollout(rollout: vf.RolloutOutput, rollout_idx: int) -> list[TrainingSample] | None:
return interleave_rollout(
rollout,
vlm_cache=vlm_cache,
cache_key=rollout_idx,
mm_token_type_ids_mapping=mm_token_type_ids_mapping,
)

results = await asyncio.gather(
*(asyncio.to_thread(process_rollout, r, rollout_idx) for rollout_idx, r in enumerate(train_rollouts))
*(
asyncio.to_thread(interleave_rollout, r, mm_token_type_ids_mapping=mm_token_type_ids_mapping)
for r in train_rollouts
)
)

# Collect results and assign advantages. Metrics are computed over all
@@ -794,7 +765,7 @@ def compute_solve_rates(df):
is_first_step = False

# Free large per-step objects to prevent memory accumulation
del train_rollouts, train_examples, training_batch, vlm_cache
del train_rollouts, train_examples, training_batch
del results_df, metrics_df
gc.collect()

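The mm_token_type_id_map comment above describes a token-id → modality map supplied by the renderer (1 = image patch, 2 = video patch). A minimal sketch of how such a map can be turned into per-sample mm_token_type_ids; the helper name, the 0-for-text fallback, and the example token id are assumptions, only the 1/2 image/video markers come from the comment:

def build_mm_token_type_ids(
    token_ids: list[int],
    mm_token_type_id_map: dict[int, int] | None,
) -> list[int] | None:
    """Mark each token as text (0), image patch (1), or video patch (2)."""
    if not mm_token_type_id_map:
        # Text-only renderer: an empty map (or a missing attribute) means nothing to mark.
        return None
    return [mm_token_type_id_map.get(token_id, 0) for token_id in token_ids]


# Hypothetical image-pad token id; the real id comes from the renderer's own tokenizer.
print(build_mm_token_type_ids([10, 151655, 151655, 11], {151655: 1}))  # -> [0, 1, 1, 0]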