Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 46 additions & 34 deletions benchmarks/profiler/profile_sla.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
profile_prefill_aiconfigurator,
)
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui
from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient,
cleanup_remaining_deployments,
Expand Down Expand Up @@ -494,44 +495,55 @@ async def run_profile(args):
if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.")

# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
if args.pick_with_webui:
# select best P/D config in webUI
selected_prefill_idx, selected_decode_idx = pick_config_with_webui(
prefill_data, decode_data, args
)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
i for i, ttft in enumerate(prefill_data.ttft) if ttft <= args.ttft
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_prefill_idx = max_thpt_idx
logger.info(
f"Suggested prefill parallel mapping: {prefill_data.parallel_mapping_labels[selected_prefill_idx]} on {prefill_data.num_gpus[selected_prefill_idx]} GPU(s) (TTFT {prefill_data.ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_data.thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
)
# automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
i
for i, ttft in enumerate(prefill_data.ttft)
if ttft <= args.ttft
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_prefill_idx = max_thpt_idx
logger.info(
f"Suggested prefill parallel mapping: {prefill_data.parallel_mapping_labels[selected_prefill_idx]} on {prefill_data.num_gpus[selected_prefill_idx]} GPU(s) (TTFT {prefill_data.ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_data.thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
)

# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error("No decode results produced; skipping recommendations.")
return
if min(decode_data.itl) > args.itl:
logger.warning(
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error(
"No decode results produced; skipping recommendations."
)
return
if min(decode_data.itl) > args.itl:
logger.warning(
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else:
valid_indices = [
i for i, itl in enumerate(decode_data.itl) if itl <= args.itl
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [decode_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_decode_idx = max_thpt_idx
logger.info(
f"Suggested decode parallel mapping: {decode_data.parallel_mapping_labels[selected_decode_idx]} on {decode_data.num_gpus[selected_decode_idx]} GPU(s) (ITL {decode_data.itl[selected_decode_idx]:.2f} ms, throughput {decode_data.thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else:
valid_indices = [
i for i, itl in enumerate(decode_data.itl) if itl <= args.itl
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [decode_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_decode_idx = max_thpt_idx
logger.info(
f"Suggested decode parallel mapping: {decode_data.parallel_mapping_labels[selected_decode_idx]} on {decode_data.num_gpus[selected_decode_idx]} GPU(s) (ITL {decode_data.itl[selected_decode_idx]:.2f} ms, throughput {decode_data.thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
)

if args.dry_run:
# use min value for prefill and decode GPU counts
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/profiler/utils/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
AIPERF_PREFILL_BENCHMARK_OSL = 5
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4

# Cost calculation defaults
GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this consistent among all our GPUs?



class EngineType(str, Enum):
PREFILL = "prefill"
Expand Down
30 changes: 18 additions & 12 deletions benchmarks/profiler/utils/pareto.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,39 @@

def compute_pareto(x, y):
    """
    Compute the pareto front (top-left is better) for the given x and y values.

    Returns:
        tuple: (xs, ys, indices) where:
            - xs: list of x values on the pareto front
            - ys: list of y values on the pareto front
            - indices: list of original indices corresponding to the pareto points
    """
    # Guard against missing inputs.
    if x is None or y is None:
        return [], [], []

    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    if len(x) == 0:
        return [], [], []

    # Tag each point with its original position, then order by x ascending and
    # y descending so the best candidate at any given x is seen first.
    tagged = [(xv, yv, pos) for pos, (xv, yv) in enumerate(zip(x, y))]
    tagged.sort(key=lambda pt: (pt[0], -pt[1]))

    # One sweep keeps only non-dominated points (minimize x, maximize y):
    # a point survives only if it strictly improves on the best y so far.
    front = []
    best_y = float("-inf")
    for xv, yv, pos in tagged:
        if yv > best_y:
            front.append((xv, yv, pos))
            best_y = yv

    # Present the front ordered by x ascending for convenience.
    front.sort(key=lambda pt: (pt[0], pt[1]))
    return (
        [pt[0] for pt in front],
        [pt[1] for pt in front],
        [pt[2] for pt in front],
    )
7 changes: 3 additions & 4 deletions benchmarks/profiler/utils/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from matplotlib import cm
from scipy.interpolate import griddata

from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -297,13 +298,11 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
decode_data: DecodeProfileData instance containing profiling results
output_dir: directory to save the plot
"""
GPU_COST_PER_HOUR = 3.0 # $3/hour

# compute pareto front for prefill
p_ttft, p_thpt = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
p_ttft, p_thpt, _ = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)

# compute pareto front for decode
d_itl, d_thpt = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
d_itl, d_thpt, _ = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)

# convert to cost per thousand requests
p_ttft = np.array(p_ttft)
Expand Down
22 changes: 21 additions & 1 deletion benchmarks/profiler/utils/profiler_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import argparse
import ast
import os
from typing import Any, Dict

import yaml
Expand Down Expand Up @@ -84,6 +85,8 @@ def create_profiler_parser() -> argparse.Namespace:
aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dry_run: Boolean (dry run the profile job, default: False)
pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
sla:
isl: Int (target input sequence length, default: 3000)
osl: Int (target output sequence length, default: 500)
Expand Down Expand Up @@ -113,6 +116,8 @@ def create_profiler_parser() -> argparse.Namespace:
help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
"Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
)

# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--model",
type=str,
Expand All @@ -126,7 +131,6 @@ def create_profiler_parser() -> argparse.Namespace:
help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
)

# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--namespace",
type=str,
Expand Down Expand Up @@ -233,6 +237,22 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("hardware", {}).get("enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument(
"--pick-with-webui",
action="store_true",
default=config.get("sweep", {}).get("pick_with_webui", False),
help="Pick the best parallelization mapping using webUI",
)

default_webui_port = 8000
if os.environ.get("PROFILER_WEBUI_PORT"):
default_webui_port = int(os.environ.get("PROFILER_WEBUI_PORT"))
parser.add_argument(
"--webui-port",
type=int,
default=config.get("sweep", {}).get("webui_port", default_webui_port),
help="WebUI port",
)

# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
Expand Down
98 changes: 98 additions & 0 deletions benchmarks/profiler/webui/data_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"settings": {
"allow_confirm_datapoint": true,
"hide_show_config": true
},
"prefill": {
"chart": {
"labels": [],
"datasets": [
{
"label": "Prefill Performance",
"data": [],
"backgroundColor": "#1f77b4",
"borderColor": "#1f77b4"
}
],
"target_line": {
"value": 0.0,
"label": "Target TTFT: ? ms"
},
"axes": {
"x": {
"title": "Time to First Token (ms)",
"min": 0
},
"y": {
"title": "Prefill Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"TTFT (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"decode": {
"chart": {
"datasets": [],
"target_line": {
"value": 0.0,
"label": "Target ITL: ? ms"
},
"axes": {
"x": {
"title": "Inter Token Latency (ms)",
"min": 0
},
"y": {
"title": "Decode Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"ITL (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"cost": {
"chart": {
"datasets": [],
"axes": {
"x": {
"title": "Tokens per User",
"min": 0
},
"y": {
"title": "Cost ($)",
"min": 0
}
},
"title": "Cost Per 1000 ? requests"
},
"table": {
"columns": [
"TTFT (ms)",
"Prefill Thpt (tokens/s/GPU)",
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
"Action"
],
"data": []
}
}
}
Loading
Loading