Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions examples/backends/sglang/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi

# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run worker
OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=8081 \
# run worker with metrics enabled
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand Down
5 changes: 3 additions & 2 deletions examples/backends/sglang/launch/agg_embed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi

# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run worker
OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.sglang \
--embedding-worker \
--model-path Qwen/Qwen3-Embedding-4B \
Expand Down
7 changes: 4 additions & 3 deletions examples/backends/sglang/launch/agg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi

# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --router-mode kv --http-port=8000 &
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run worker
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand All @@ -60,7 +61,7 @@ python3 -m dynamo.sglang \
--enable-metrics &
WORKER_PID=$!

OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand Down
7 changes: 4 additions & 3 deletions examples/backends/sglang/launch/disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then
fi

# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend --http-port=8000 &
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand All @@ -64,7 +65,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$!

# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand Down
12 changes: 6 additions & 6 deletions examples/backends/sglang/launch/disagg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,16 @@ if [ "$ENABLE_OTEL" = true ]; then
fi

# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend \
--http-port=8000 \
--router-mode kv \
--kv-overlap-score-weight 0 \
--router-reset-states &
DYNAMO_PID=$!

# run prefill router
OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=8081 \
OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_ROUTER:-8081} \
python3 -m dynamo.router \
--endpoint dynamo.prefill.generate \
--block-size 64 \
Expand All @@ -63,7 +63,7 @@ python3 -m dynamo.router \
PREFILL_ROUTER_PID=$!

# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=8082 \
OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER1:-8082} \
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
Expand All @@ -78,7 +78,7 @@ python3 -m dynamo.sglang \
PREFILL_PID=$!

# run prefill worker
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=8083 \
OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER2:-8083} \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
Expand All @@ -93,7 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
PREFILL_PID=$!

# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=8084 \
OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER1:-8084} \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
Expand All @@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
PREFILL_PID=$!

# run decode worker
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=8085 \
OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER2:-8085} \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
Expand Down
7 changes: 4 additions & 3 deletions examples/backends/sglang/launch/disagg_same_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ trap cleanup EXIT INT TERM


# run ingress with KV router mode for disaggregated setup
python3 -m dynamo.frontend --router-mode kv --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run prefill worker with metrics on port 8081
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand Down Expand Up @@ -71,7 +72,7 @@ echo "Waiting for prefill worker to initialize..."
sleep 5

# run decode worker with metrics on port 8082 (foreground)
DYN_SYSTEM_PORT=8082 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/sglang/launch/multimodal_agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi

# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run SGLang multimodal processor
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/sglang/launch/multimodal_disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
fi

# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run SGLang multimodal processor
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run worker
Expand Down
5 changes: 3 additions & 2 deletions examples/backends/trtllm/launch/agg_metrics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ cleanup() {
trap cleanup EXIT INT TERM

# Run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# Run worker
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/agg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run worker
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run prefill worker
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/disagg_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run prefill worker
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/disagg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM


# run frontend with KV routing for cache-aware optimization
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run prefill worker
Expand Down
7 changes: 4 additions & 3 deletions examples/backends/trtllm/launch/disagg_same_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,13 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
Expand All @@ -65,7 +66,7 @@ PREFILL_PID=$!

# run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=8082 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/epd_disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$!

# run encode worker
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/trtllm/launch/gpt_oss_disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ trap 'echo Cleaning up...; kill 0' EXIT


# run frontend
python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode round-robin &

# With tensor_parallel_size=4, each worker needs 4 GPUs
# run prefill worker
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0
sleep 2

# Start OpenAI Frontend which will dynamically discover workers when they startup
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# NOTE: This is a blocking call.
python3 -m dynamo.frontend --http-port 8000
python3 -m dynamo.frontend

5 changes: 3 additions & 2 deletions examples/backends/vllm/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# run worker
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
3 changes: 2 additions & 1 deletion examples/backends/vllm/launch/agg_kvbm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# run worker with KVBM enabled
# NOTE: remove --enforce-eager for production use
Expand Down
2 changes: 1 addition & 1 deletion examples/backends/vllm/launch/agg_kvbm_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ export PYTHONHASHSEED=0
MODEL="Qwen/Qwen3-0.6B"

# run frontend + KV router
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--router-reset-states &

# run workers with KVBM enabled
Expand Down
5 changes: 3 additions & 2 deletions examples/backends/vllm/launch/agg_lmcache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# run worker with LMCache enabled
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
5 changes: 3 additions & 2 deletions examples/backends/vllm/launch/agg_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ done
export DYN_REQUEST_PLANE=tcp

# Start frontend with Rust OpenAIPreprocessor
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# Configure GPU memory optimization for specific models
EXTRA_ARGS=""
Expand All @@ -59,7 +60,7 @@ fi
# Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving
DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS

# Wait for all background processes to complete
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/vllm/launch/agg_multimodal_epd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ else
fi

# Start frontend (HTTP endpoint)
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
EXTRA_ARGS=""
Expand Down
3 changes: 2 additions & 1 deletion examples/backends/vllm/launch/agg_multimodal_llama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# run ingress
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

# run processor
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
Expand Down
5 changes: 3 additions & 2 deletions examples/backends/vllm/launch/agg_request_planes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE"

# Frontend
python -m dynamo.frontend --http-port=8000 &
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

DYN_SYSTEM_PORT=8081 \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
DYN_HEALTH_CHECK_ENABLED=true \
python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none
Loading
Loading