Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 46 additions & 34 deletions benchmarks/profiler/profile_sla.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
profile_prefill_aiconfigurator,
)
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui
from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient,
cleanup_remaining_deployments,
Expand Down Expand Up @@ -494,44 +495,55 @@ async def run_profile(args):
if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.")

# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
if args.pick_with_webui:
# select best P/D config in webUI
selected_prefill_idx, selected_decode_idx = pick_config_with_webui(
prefill_data, decode_data, args
)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
i for i, ttft in enumerate(prefill_data.ttft) if ttft <= args.ttft
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_prefill_idx = max_thpt_idx
logger.info(
f"Suggested prefill parallel mapping: {prefill_data.parallel_mapping_labels[selected_prefill_idx]} on {prefill_data.num_gpus[selected_prefill_idx]} GPU(s) (TTFT {prefill_data.ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_data.thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
)
# automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
i
for i, ttft in enumerate(prefill_data.ttft)
if ttft <= args.ttft
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_prefill_idx = max_thpt_idx
logger.info(
f"Suggested prefill parallel mapping: {prefill_data.parallel_mapping_labels[selected_prefill_idx]} on {prefill_data.num_gpus[selected_prefill_idx]} GPU(s) (TTFT {prefill_data.ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_data.thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
)

# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error("No decode results produced; skipping recommendations.")
return
if min(decode_data.itl) > args.itl:
logger.warning(
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error(
"No decode results produced; skipping recommendations."
)
return
if min(decode_data.itl) > args.itl:
logger.warning(
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else:
valid_indices = [
i for i, itl in enumerate(decode_data.itl) if itl <= args.itl
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [decode_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_decode_idx = max_thpt_idx
logger.info(
f"Suggested decode parallel mapping: {decode_data.parallel_mapping_labels[selected_decode_idx]} on {decode_data.num_gpus[selected_decode_idx]} GPU(s) (ITL {decode_data.itl[selected_decode_idx]:.2f} ms, throughput {decode_data.thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else:
valid_indices = [
i for i, itl in enumerate(decode_data.itl) if itl <= args.itl
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [decode_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_decode_idx = max_thpt_idx
logger.info(
f"Suggested decode parallel mapping: {decode_data.parallel_mapping_labels[selected_decode_idx]} on {decode_data.num_gpus[selected_decode_idx]} GPU(s) (ITL {decode_data.itl[selected_decode_idx]:.2f} ms, throughput {decode_data.thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
)

if args.dry_run:
# use min value for prefill and decode GPU counts
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/profiler/utils/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
AIPERF_PREFILL_BENCHMARK_OSL = 5
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4

# Cost calculation defaults
GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this consistent among all our GPUs?



class EngineType(str, Enum):
PREFILL = "prefill"
Expand Down
30 changes: 18 additions & 12 deletions benchmarks/profiler/utils/pareto.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,39 @@

def compute_pareto(x, y):
    """
    Compute the pareto front (top-left is better) for the given x and y values.

    Returns:
        tuple: (xs, ys, indices) where:
            - xs: list of x values on the pareto front
            - ys: list of y values on the pareto front
            - indices: list of original indices corresponding to the pareto points
    """
    # Guard against missing inputs.
    if x is None or y is None:
        return [], [], []

    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    if len(x) == 0:
        return [], [], []

    # Tag each point with its original position, then order by x ascending and
    # y descending so the best candidate at any given x is seen first.
    tagged = [(xv, yv, pos) for pos, (xv, yv) in enumerate(zip(x, y))]
    tagged.sort(key=lambda pt: (pt[0], -pt[1]))

    # One sweep keeps only non-dominated points (minimize x, maximize y):
    # a point survives only if it strictly improves on the best y so far.
    front = []
    best_y = float("-inf")
    for xv, yv, pos in tagged:
        if yv > best_y:
            front.append((xv, yv, pos))
            best_y = yv

    # Present the front ordered by x ascending for convenience.
    front.sort(key=lambda pt: (pt[0], pt[1]))
    return (
        [pt[0] for pt in front],
        [pt[1] for pt in front],
        [pt[2] for pt in front],
    )
7 changes: 3 additions & 4 deletions benchmarks/profiler/utils/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from matplotlib import cm
from scipy.interpolate import griddata

from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -297,13 +298,11 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
decode_data: DecodeProfileData instance containing profiling results
output_dir: directory to save the plot
"""
GPU_COST_PER_HOUR = 3.0 # $3/hour

# compute pareto front for prefill
p_ttft, p_thpt = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
p_ttft, p_thpt, _ = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)

# compute pareto front for decode
d_itl, d_thpt = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
d_itl, d_thpt, _ = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)

# convert to cost per thousand requests
p_ttft = np.array(p_ttft)
Expand Down
22 changes: 21 additions & 1 deletion benchmarks/profiler/utils/profiler_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import argparse
import ast
import os
from typing import Any, Dict

import yaml
Expand Down Expand Up @@ -84,6 +85,8 @@ def create_profiler_parser() -> argparse.Namespace:
aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dry_run: Boolean (dry run the profile job, default: False)
pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
sla:
isl: Int (target input sequence length, default: 3000)
osl: Int (target output sequence length, default: 500)
Expand Down Expand Up @@ -113,6 +116,8 @@ def create_profiler_parser() -> argparse.Namespace:
help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
"Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
)

# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--model",
type=str,
Expand All @@ -126,7 +131,6 @@ def create_profiler_parser() -> argparse.Namespace:
help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
)

# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--namespace",
type=str,
Expand Down Expand Up @@ -233,6 +237,22 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("hardware", {}).get("enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument(
"--pick-with-webui",
action="store_true",
default=config.get("sweep", {}).get("pick_with_webui", False),
help="Pick the best parallelization mapping using webUI",
)

default_webui_port = 8000
if os.environ.get("PROFILER_WEBUI_PORT"):
default_webui_port = int(os.environ.get("PROFILER_WEBUI_PORT"))
parser.add_argument(
"--webui-port",
type=int,
default=config.get("sweep", {}).get("webui_port", default_webui_port),
help="WebUI port",
)

# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
Expand Down
98 changes: 98 additions & 0 deletions benchmarks/profiler/webui/data_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"settings": {
"allow_confirm_datapoint": true,
"hide_show_config": true
},
"prefill": {
"chart": {
"labels": [],
"datasets": [
{
"label": "Prefill Performance",
"data": [],
"backgroundColor": "#1f77b4",
"borderColor": "#1f77b4"
}
],
"target_line": {
"value": 0.0,
"label": "Target TTFT: ? ms"
},
"axes": {
"x": {
"title": "Time to First Token (ms)",
"min": 0
},
"y": {
"title": "Prefill Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"TTFT (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"decode": {
"chart": {
"datasets": [],
"target_line": {
"value": 0.0,
"label": "Target ITL: ? ms"
},
"axes": {
"x": {
"title": "Inter Token Latency (ms)",
"min": 0
},
"y": {
"title": "Decode Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"ITL (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"cost": {
"chart": {
"datasets": [],
"axes": {
"x": {
"title": "Tokens per User",
"min": 0
},
"y": {
"title": "Cost ($)",
"min": 0
}
},
"title": "Cost Per 1000 ? requests"
},
"table": {
"columns": [
"TTFT (ms)",
"Prefill Thpt (tokens/s/GPU)",
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
"Action"
],
"data": []
}
}
}
Loading
Loading