Commit 7a3b6b6

cleanup
Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent 6802cab commit 7a3b6b6

4 files changed: +111 −172 lines

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 11 additions & 28 deletions
@@ -24,7 +24,7 @@
     dbo_register_recv_hook,
     dbo_yield,
 )
-from vllm.v1.worker.workspace import WorkspaceSpec, current_workspace_manager
+from vllm.v1.worker.workspace import current_workspace_manager
 
 #
 # This file defines a set of base classes used to make MoE kernels more modular.
@@ -766,48 +766,31 @@ def _allocate_buffers(
             local_num_experts,
             None,  # Pass None to avoid using sampled token counts
         )
-        max_workspace13_spec = WorkspaceSpec(
-            shape=max_workspace13_shape,
-            dtype=workspace_dtype,
-            name="moe.workspace13",
-        )
-        max_workspace2_spec = WorkspaceSpec(
-            shape=max_workspace2_shape,
-            dtype=workspace_dtype,
-            name="moe.workspace2",
-        )
-        max_fused_out_spec = WorkspaceSpec(
-            shape=max_fused_out_shape, dtype=out_dtype, name="moe.fused_out"
-        )
-        current_workspace_manager().reserve_simultaneous(
-            max_workspace13_spec, max_workspace2_spec, max_fused_out_spec
+
+        current_workspace_manager().get_simultaneous(
+            (max_workspace13_shape, workspace_dtype),
+            (max_workspace2_shape, workspace_dtype),
+            (max_fused_out_shape, out_dtype),
         )
 
     # We can reuse the memory between cache1 and cache3 because by the
     # time we need cache3, we're done with cache1.
-    workspace13_spec = WorkspaceSpec(
-        shape=workspace13_shape, dtype=workspace_dtype, name="moe.workspace13"
-    )
-    workspace2_spec = WorkspaceSpec(
-        shape=workspace2_shape, dtype=workspace_dtype, name="moe.workspace2"
-    )
-
     # Construct the entire output that can then be processed in chunks.
     # Reuse workspace13 for the output in the non-chunked case as long
     # as it is large enough. This will not always be the case for standard
     # format experts and with experts that have empty workspaces.
     if num_chunks == 1 and prod(workspace13_shape) >= prod(fused_out_shape):
         workspace13, workspace2 = current_workspace_manager().get_simultaneous(
-            workspace13_spec, workspace2_spec
+            (workspace13_shape, workspace_dtype),
+            (workspace2_shape, workspace_dtype),
         )
         fused_out = _resize_cache(workspace13, fused_out_shape)
     else:
-        fused_out_spec = WorkspaceSpec(
-            shape=fused_out_shape, dtype=out_dtype, name="moe.fused_out"
-        )
         workspace13, workspace2, fused_out = (
             current_workspace_manager().get_simultaneous(
-                workspace13_spec, workspace2_spec, fused_out_spec
+                (workspace13_shape, workspace_dtype),
+                (workspace2_shape, workspace_dtype),
+                (fused_out_shape, out_dtype),
             )
         )

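Taken together, these hunks drop the named WorkspaceSpec objects in favor of passing (shape, dtype) tuples straight to the workspace manager. The sketch below is not part of the commit; it only restates the call pattern the diff adopts, assumes it runs inside a vLLM worker where current_workspace_manager() is active, and stands in for the _resize_cache helper with a plain view. The helper name, shapes, and type hints are illustrative.

    # Minimal sketch of the new call pattern; not part of the commit. Assumes a
    # workspace manager is active (i.e. running inside a vLLM worker).
    from math import prod

    import torch

    from vllm.v1.worker.workspace import current_workspace_manager


    def allocate_moe_buffers(
        workspace13_shape: tuple[int, ...],
        workspace2_shape: tuple[int, ...],
        fused_out_shape: tuple[int, ...],
        workspace_dtype: torch.dtype,
        out_dtype: torch.dtype,
        num_chunks: int,
    ):
        if num_chunks == 1 and prod(workspace13_shape) >= prod(fused_out_shape):
            # Each (shape, dtype) tuple yields one tensor from the shared workspace.
            workspace13, workspace2 = current_workspace_manager().get_simultaneous(
                (workspace13_shape, workspace_dtype),
                (workspace2_shape, workspace_dtype),
            )
            # Reuse workspace13 for the fused output (plain-view stand-in for
            # the _resize_cache helper used in the real code).
            fused_out = workspace13.view(-1)[: prod(fused_out_shape)].view(fused_out_shape)
        else:
            # Otherwise request a dedicated output buffer as well.
            workspace13, workspace2, fused_out = (
                current_workspace_manager().get_simultaneous(
                    (workspace13_shape, workspace_dtype),
                    (workspace2_shape, workspace_dtype),
                    (fused_out_shape, out_dtype),
                )
            )
        return workspace13, workspace2, fused_out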
vllm/model_executor/models/deepseek_v2.py

Lines changed: 8 additions & 14 deletions
@@ -86,7 +86,7 @@
     DeepseekV32IndexerMetadata,
 )
 from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
-from vllm.v1.worker.workspace import WorkspaceSpec, current_workspace_manager
+from vllm.v1.worker.workspace import current_workspace_manager
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (
@@ -520,20 +520,13 @@ def sparse_attn_indexer(
     # careful! this will be None in dummy run
     attn_metadata = get_forward_context().attn_metadata
 
-    k_fp8_spec = WorkspaceSpec(
-        shape=(total_seq_lens, head_dim),
-        dtype=torch.float8_e4m3fn,
-        name="sparse_attn_indexer.k_fp8",
-    )
-    k_scale_spec = WorkspaceSpec(
-        shape=(total_seq_lens, 4),
-        dtype=torch.uint8,
-        name="sparse_attn_indexer.k_scale",
-    )
-
     # assert isinstance(attn_metadata, dict)
     if not isinstance(attn_metadata, dict):
-        current_workspace_manager().reserve_simultaneous(k_fp8_spec, k_scale_spec)
+        # Reserve workspace for indexer during profiling run
+        current_workspace_manager().get_simultaneous(
+            ((total_seq_lens, head_dim), torch.float8_e4m3fn),
+            ((total_seq_lens, 4), torch.uint8),
+        )
 
         return sparse_attn_indexer_fake(
             hidden_states,
@@ -572,7 +565,8 @@ def sparse_attn_indexer(
     # Get the full shared workspace buffers once (will allocate on first use)
     workspace_manager = current_workspace_manager()
     k_fp8_full, k_scale_full = workspace_manager.get_simultaneous(
-        k_fp8_spec, k_scale_spec
+        ((total_seq_lens, head_dim), torch.float8_e4m3fn),
+        ((total_seq_lens, 4), torch.uint8),
    )
 
     for chunk in prefill_metadata.chunks:

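In deepseek_v2.py the same tuple-based requests now do double duty: the dummy/profiling path issues them once so the workspace is sized, and the real prefill path issues identical requests to obtain the shared k_fp8/k_scale buffers. A rough sketch of that pattern under the same assumptions as above; the function names are illustrative, not the model's actual methods.

    import torch

    from vllm.v1.worker.workspace import current_workspace_manager


    def reserve_indexer_workspace(total_seq_lens: int, head_dim: int) -> None:
        # Dummy/profiling run: issue the requests once so the workspace is sized
        # for the worst case; the returned tensors are discarded.
        current_workspace_manager().get_simultaneous(
            ((total_seq_lens, head_dim), torch.float8_e4m3fn),
            ((total_seq_lens, 4), torch.uint8),
        )


    def get_indexer_workspace(total_seq_lens: int, head_dim: int):
        # Real prefill path: identical (shape, dtype) requests return the shared
        # k_fp8 / k_scale buffers, which are then filled chunk by chunk.
        k_fp8_full, k_scale_full = current_workspace_manager().get_simultaneous(
            ((total_seq_lens, head_dim), torch.float8_e4m3fn),
            ((total_seq_lens, 4), torch.uint8),
        )
        return k_fp8_full, k_scale_full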
vllm/v1/attention/backends/mla/flashmla_sparse.py

Lines changed: 7 additions & 8 deletions
@@ -31,7 +31,7 @@
     split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.workspace import WorkspaceSpec, current_workspace_manager
+from vllm.v1.worker.workspace import current_workspace_manager
 
 if TYPE_CHECKING:
     from vllm.model_executor.models.deepseek_v2 import Indexer
@@ -636,14 +636,13 @@ def __init__(
         vllm_config = get_current_vllm_config()
         prefill_workspace_size = get_prefill_workspace_size(vllm_config)
 
-        self.prefill_workspace_spec = WorkspaceSpec(
-            shape=(prefill_workspace_size, head_size),
-            dtype=torch.bfloat16,
-            name="FlashMLASparseImpl.prefill_workspace",
-        )
+        self.prefill_workspace_shape = (prefill_workspace_size, head_size)
 
         if kv_cache_dtype == "fp8_ds_mla":
-            current_workspace_manager().reserve(self.prefill_workspace_spec)
+            # Reserve workspace during initialization
+            current_workspace_manager().get(
+                self.prefill_workspace_shape, torch.bfloat16
+            )
 
     def _forward_bf16_kv(
         self,
@@ -810,7 +809,7 @@ def forward(
         # Process prefill chunks
         assert attn_metadata.prefill_chunks is not None
         prefill_bf16_workspace = current_workspace_manager().get(
-            self.prefill_workspace_spec
+            self.prefill_workspace_shape, torch.bfloat16
         )
 
         for chunk in attn_metadata.prefill_chunks:

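flashmla_sparse.py uses the single-buffer form: get(shape, dtype) replaces the old reserve(spec)/get(spec) pair, so one call in __init__ reserves the bf16 prefill scratch space and a later call in forward fetches it. A minimal sketch, assuming get behaves as the hunks above imply; the class name and methods are illustrative.

    import torch

    from vllm.v1.worker.workspace import current_workspace_manager


    class SparsePrefillScratch:
        """Illustrative holder mirroring the FlashMLASparseImpl pattern."""

        def __init__(self, prefill_workspace_size: int, head_size: int) -> None:
            self.prefill_workspace_shape = (prefill_workspace_size, head_size)
            # First request reserves the bf16 scratch buffer in the shared
            # workspace so the memory is accounted for up front.
            current_workspace_manager().get(self.prefill_workspace_shape, torch.bfloat16)

        def prefill_workspace(self) -> torch.Tensor:
            # A later call with the same shape/dtype fetches the buffer used
            # while processing prefill chunks.
            return current_workspace_manager().get(
                self.prefill_workspace_shape, torch.bfloat16
            )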