From 52f645e54df37fa7d29638e021f756422311dadb Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Sat, 22 Nov 2025 02:33:53 +0000 Subject: [PATCH] feat: add environment variable support for dynamic port configuration (part 1) Remove hardcoded --http-port=8000 flags from launch scripts. Frontend now accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000). Add DYN_SYSTEM_PORT fallback syntax ${DYN_SYSTEM_PORT:-8081} for worker ports. Multi-worker scenarios use distinct env vars (DYN_SYSTEM_PORT_PREFILL, DYN_SYSTEM_PORT_DECODE, DYN_SYSTEM_PORT_WORKER1, etc). This enables flexible port configuration in preparation for parallel testing while maintaining backward compatibility. Related: DIS-1022 Signed-off-by: Keiven Chang --- examples/backends/sglang/launch/agg.sh | 7 ++++--- examples/backends/sglang/launch/agg_embed.sh | 5 +++-- examples/backends/sglang/launch/agg_router.sh | 7 ++++--- examples/backends/sglang/launch/disagg.sh | 7 ++++--- examples/backends/sglang/launch/disagg_router.sh | 12 ++++++------ examples/backends/sglang/launch/disagg_same_gpu.sh | 7 ++++--- examples/backends/sglang/launch/multimodal_agg.sh | 3 ++- examples/backends/sglang/launch/multimodal_disagg.sh | 3 ++- examples/backends/trtllm/launch/agg.sh | 3 ++- examples/backends/trtllm/launch/agg_metrics.sh | 5 +++-- examples/backends/trtllm/launch/agg_router.sh | 3 ++- examples/backends/trtllm/launch/disagg.sh | 3 ++- examples/backends/trtllm/launch/disagg_multimodal.sh | 3 ++- examples/backends/trtllm/launch/disagg_router.sh | 3 ++- examples/backends/trtllm/launch/disagg_same_gpu.sh | 7 ++++--- examples/backends/trtllm/launch/epd_disagg.sh | 3 ++- examples/backends/trtllm/launch/gpt_oss_disagg.sh | 3 ++- .../performance_sweeps/scripts/start_frontend.sh | 3 ++- examples/backends/vllm/launch/agg.sh | 5 +++-- examples/backends/vllm/launch/agg_kvbm.sh | 3 ++- examples/backends/vllm/launch/agg_kvbm_router.sh | 2 +- examples/backends/vllm/launch/agg_lmcache.sh | 5 +++-- examples/backends/vllm/launch/agg_multimodal.sh | 5 +++-- examples/backends/vllm/launch/agg_multimodal_epd.sh | 3 ++- .../backends/vllm/launch/agg_multimodal_llama.sh | 3 ++- examples/backends/vllm/launch/agg_request_planes.sh | 5 +++-- examples/backends/vllm/launch/agg_router.sh | 2 +- examples/backends/vllm/launch/dep.sh | 3 ++- examples/backends/vllm/launch/disagg.sh | 3 ++- examples/backends/vllm/launch/disagg_kvbm.sh | 3 ++- examples/backends/vllm/launch/disagg_kvbm_2p2d.sh | 3 ++- examples/backends/vllm/launch/disagg_kvbm_router.sh | 2 +- examples/backends/vllm/launch/disagg_lmcache.sh | 3 ++- .../backends/vllm/launch/disagg_multimodal_epd.sh | 3 ++- .../backends/vllm/launch/disagg_multimodal_llama.sh | 3 ++- examples/backends/vllm/launch/disagg_router.sh | 2 +- examples/backends/vllm/launch/disagg_same_gpu.sh | 7 ++++--- examples/backends/vllm/launch/dsr1_dep.sh | 3 ++- .../multinode/trtllm/start_frontend_services.sh | 3 ++- tests/serve/launch/template_verifier.sh | 3 ++- 40 files changed, 98 insertions(+), 63 deletions(-) diff --git a/examples/backends/sglang/launch/agg.sh b/examples/backends/sglang/launch/agg.sh index 964a63283d..b498e59ea1 100755 --- a/examples/backends/sglang/launch/agg.sh +++ b/examples/backends/sglang/launch/agg.sh @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then fi # run ingress +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) OTEL_SERVICE_NAME=dynamo-frontend \ -python3 -m dynamo.frontend --http-port=8000 & +python3 -m dynamo.frontend & DYNAMO_PID=$! -# run worker -OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=8081 \ +# run worker with metrics enabled +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/agg_embed.sh b/examples/backends/sglang/launch/agg_embed.sh index edd761b7d5..9064273f30 100755 --- a/examples/backends/sglang/launch/agg_embed.sh +++ b/examples/backends/sglang/launch/agg_embed.sh @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then fi # run ingress +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) OTEL_SERVICE_NAME=dynamo-frontend \ -python3 -m dynamo.frontend --http-port=8000 & +python3 -m dynamo.frontend & DYNAMO_PID=$! # run worker -OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=8081 \ +OTEL_SERVICE_NAME=dynamo-worker-embedding DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python3 -m dynamo.sglang \ --embedding-worker \ --model-path Qwen/Qwen3-Embedding-4B \ diff --git a/examples/backends/sglang/launch/agg_router.sh b/examples/backends/sglang/launch/agg_router.sh index 659fbe3199..0b336f5f15 100755 --- a/examples/backends/sglang/launch/agg_router.sh +++ b/examples/backends/sglang/launch/agg_router.sh @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then fi # run ingress +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) OTEL_SERVICE_NAME=dynamo-frontend \ -python3 -m dynamo.frontend --router-mode kv --http-port=8000 & +python3 -m dynamo.frontend --router-mode kv & DYNAMO_PID=$! # run worker -OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=8081 \ +OTEL_SERVICE_NAME=dynamo-worker-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER1:-8081} \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ @@ -60,7 +61,7 @@ python3 -m dynamo.sglang \ --enable-metrics & WORKER_PID=$! -OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=8082 \ +OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/disagg.sh b/examples/backends/sglang/launch/disagg.sh index c0c0c00ca4..53e22fc723 100755 --- a/examples/backends/sglang/launch/disagg.sh +++ b/examples/backends/sglang/launch/disagg.sh @@ -44,12 +44,13 @@ if [ "$ENABLE_OTEL" = true ]; then fi # run ingress +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) OTEL_SERVICE_NAME=dynamo-frontend \ -python3 -m dynamo.frontend --http-port=8000 & +python3 -m dynamo.frontend & DYNAMO_PID=$! # run prefill worker -OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=8081 \ +OTEL_SERVICE_NAME=dynamo-worker-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8081} \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ @@ -64,7 +65,7 @@ python3 -m dynamo.sglang \ PREFILL_PID=$! # run decode worker -OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=8082 \ +OTEL_SERVICE_NAME=dynamo-worker-decode DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE:-8082} \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/disagg_router.sh b/examples/backends/sglang/launch/disagg_router.sh index 485123600c..916cbbf410 100755 --- a/examples/backends/sglang/launch/disagg_router.sh +++ b/examples/backends/sglang/launch/disagg_router.sh @@ -45,16 +45,16 @@ if [ "$ENABLE_OTEL" = true ]; then fi # run ingress +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) OTEL_SERVICE_NAME=dynamo-frontend \ python3 -m dynamo.frontend \ - --http-port=8000 \ --router-mode kv \ --kv-overlap-score-weight 0 \ --router-reset-states & DYNAMO_PID=$! # run prefill router -OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=8081 \ +OTEL_SERVICE_NAME=dynamo-router-prefill DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_ROUTER:-8081} \ python3 -m dynamo.router \ --endpoint dynamo.prefill.generate \ --block-size 64 \ @@ -63,7 +63,7 @@ python3 -m dynamo.router \ PREFILL_ROUTER_PID=$! # run prefill worker -OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=8082 \ +OTEL_SERVICE_NAME=dynamo-worker-prefill-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER1:-8082} \ python3 -m dynamo.sglang \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ @@ -78,7 +78,7 @@ python3 -m dynamo.sglang \ PREFILL_PID=$! # run prefill worker -OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=8083 \ +OTEL_SERVICE_NAME=dynamo-worker-prefill-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL_WORKER2:-8083} \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ @@ -93,7 +93,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ PREFILL_PID=$! # run decode worker -OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=8084 \ +OTEL_SERVICE_NAME=dynamo-worker-decode-1 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER1:-8084} \ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ @@ -108,7 +108,7 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ PREFILL_PID=$! # run decode worker -OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=8085 \ +OTEL_SERVICE_NAME=dynamo-worker-decode-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_DECODE_WORKER2:-8085} \ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ diff --git a/examples/backends/sglang/launch/disagg_same_gpu.sh b/examples/backends/sglang/launch/disagg_same_gpu.sh index ba309e56a7..c04ca863a8 100755 --- a/examples/backends/sglang/launch/disagg_same_gpu.sh +++ b/examples/backends/sglang/launch/disagg_same_gpu.sh @@ -37,11 +37,12 @@ trap cleanup EXIT INT TERM # run ingress with KV router mode for disaggregated setup -python3 -m dynamo.frontend --router-mode kv --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend --router-mode kv & DYNAMO_PID=$! # run prefill worker with metrics on port 8081 -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ @@ -71,7 +72,7 @@ echo "Waiting for prefill worker to initialize..." sleep 5 # run decode worker with metrics on port 8082 (foreground) -DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ python3 -m dynamo.sglang \ --model-path Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \ diff --git a/examples/backends/sglang/launch/multimodal_agg.sh b/examples/backends/sglang/launch/multimodal_agg.sh index 93514c397b..735ef099d9 100755 --- a/examples/backends/sglang/launch/multimodal_agg.sh +++ b/examples/backends/sglang/launch/multimodal_agg.sh @@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then fi # run ingress -python3 -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run SGLang multimodal processor diff --git a/examples/backends/sglang/launch/multimodal_disagg.sh b/examples/backends/sglang/launch/multimodal_disagg.sh index 321b73abe1..8e0043c01f 100755 --- a/examples/backends/sglang/launch/multimodal_disagg.sh +++ b/examples/backends/sglang/launch/multimodal_disagg.sh @@ -60,7 +60,8 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then fi # run ingress -python3 -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run SGLang multimodal processor diff --git a/examples/backends/trtllm/launch/agg.sh b/examples/backends/trtllm/launch/agg.sh index 56a842eb52..9318eb0857 100755 --- a/examples/backends/trtllm/launch/agg.sh +++ b/examples/backends/trtllm/launch/agg.sh @@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run worker diff --git a/examples/backends/trtllm/launch/agg_metrics.sh b/examples/backends/trtllm/launch/agg_metrics.sh index 61671b4960..f7924e925a 100755 --- a/examples/backends/trtllm/launch/agg_metrics.sh +++ b/examples/backends/trtllm/launch/agg_metrics.sh @@ -19,11 +19,12 @@ cleanup() { trap cleanup EXIT INT TERM # Run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # Run worker -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ diff --git a/examples/backends/trtllm/launch/agg_router.sh b/examples/backends/trtllm/launch/agg_router.sh index 1b0568535a..26d5509839 100755 --- a/examples/backends/trtllm/launch/agg_router.sh +++ b/examples/backends/trtllm/launch/agg_router.sh @@ -19,7 +19,8 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --router-mode kv --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend --router-mode kv & DYNAMO_PID=$! # run worker diff --git a/examples/backends/trtllm/launch/disagg.sh b/examples/backends/trtllm/launch/disagg.sh index 7f75ee908e..1a9efa4092 100755 --- a/examples/backends/trtllm/launch/disagg.sh +++ b/examples/backends/trtllm/launch/disagg.sh @@ -25,7 +25,8 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run prefill worker diff --git a/examples/backends/trtllm/launch/disagg_multimodal.sh b/examples/backends/trtllm/launch/disagg_multimodal.sh index 4fb0f74716..bf554eb6c4 100755 --- a/examples/backends/trtllm/launch/disagg_multimodal.sh +++ b/examples/backends/trtllm/launch/disagg_multimodal.sh @@ -23,7 +23,8 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run prefill worker diff --git a/examples/backends/trtllm/launch/disagg_router.sh b/examples/backends/trtllm/launch/disagg_router.sh index 1b005a44ae..0170eb7212 100755 --- a/examples/backends/trtllm/launch/disagg_router.sh +++ b/examples/backends/trtllm/launch/disagg_router.sh @@ -22,7 +22,8 @@ trap cleanup EXIT INT TERM # run frontend with KV routing for cache-aware optimization -python3 -m dynamo.frontend --router-mode kv --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend --router-mode kv & DYNAMO_PID=$! # run prefill worker diff --git a/examples/backends/trtllm/launch/disagg_same_gpu.sh b/examples/backends/trtllm/launch/disagg_same_gpu.sh index 348c1ce61e..43b671de24 100755 --- a/examples/backends/trtllm/launch/disagg_same_gpu.sh +++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh @@ -48,12 +48,13 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run prefill worker (shares GPU with decode) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ @@ -65,7 +66,7 @@ PREFILL_PID=$! # run decode worker (shares GPU with prefill) CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ -DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ diff --git a/examples/backends/trtllm/launch/epd_disagg.sh b/examples/backends/trtllm/launch/epd_disagg.sh index bce0c16be4..b98b8719a8 100755 --- a/examples/backends/trtllm/launch/epd_disagg.sh +++ b/examples/backends/trtllm/launch/epd_disagg.sh @@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run encode worker diff --git a/examples/backends/trtllm/launch/gpt_oss_disagg.sh b/examples/backends/trtllm/launch/gpt_oss_disagg.sh index bbe560b231..cdbc083029 100755 --- a/examples/backends/trtllm/launch/gpt_oss_disagg.sh +++ b/examples/backends/trtllm/launch/gpt_oss_disagg.sh @@ -14,7 +14,8 @@ trap 'echo Cleaning up...; kill 0' EXIT # run frontend -python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend --router-mode round-robin & # With tensor_parallel_size=4, each worker needs 4 GPUs # run prefill worker diff --git a/examples/backends/trtllm/performance_sweeps/scripts/start_frontend.sh b/examples/backends/trtllm/performance_sweeps/scripts/start_frontend.sh index fba1855f68..5593a86dd3 100755 --- a/examples/backends/trtllm/performance_sweeps/scripts/start_frontend.sh +++ b/examples/backends/trtllm/performance_sweeps/scripts/start_frontend.sh @@ -20,6 +20,7 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0 sleep 2 # Start OpenAI Frontend which will dynamically discover workers when they startup +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # NOTE: This is a blocking call. -python3 -m dynamo.frontend --http-port 8000 +python3 -m dynamo.frontend diff --git a/examples/backends/vllm/launch/agg.sh b/examples/backends/vllm/launch/agg.sh index 696021d4b5..868b987451 100755 --- a/examples/backends/vllm/launch/agg.sh +++ b/examples/backends/vllm/launch/agg.sh @@ -5,9 +5,10 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # run worker # --enforce-eager is added for quick deployment. for production use, need to remove this flag -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none diff --git a/examples/backends/vllm/launch/agg_kvbm.sh b/examples/backends/vllm/launch/agg_kvbm.sh index 7badd607d0..3997313bce 100755 --- a/examples/backends/vllm/launch/agg_kvbm.sh +++ b/examples/backends/vllm/launch/agg_kvbm.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # run worker with KVBM enabled # NOTE: remove --enforce-eager for production use diff --git a/examples/backends/vllm/launch/agg_kvbm_router.sh b/examples/backends/vllm/launch/agg_kvbm_router.sh index 972835039a..4145d4ea35 100755 --- a/examples/backends/vllm/launch/agg_kvbm_router.sh +++ b/examples/backends/vllm/launch/agg_kvbm_router.sh @@ -11,9 +11,9 @@ export PYTHONHASHSEED=0 MODEL="Qwen/Qwen3-0.6B" # run frontend + KV router +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) python -m dynamo.frontend \ --router-mode kv \ - --http-port 8000 \ --router-reset-states & # run workers with KVBM enabled diff --git a/examples/backends/vllm/launch/agg_lmcache.sh b/examples/backends/vllm/launch/agg_lmcache.sh index cfec901e9a..e5cac9ca74 100755 --- a/examples/backends/vllm/launch/agg_lmcache.sh +++ b/examples/backends/vllm/launch/agg_lmcache.sh @@ -5,8 +5,9 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # run worker with LMCache enabled -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache diff --git a/examples/backends/vllm/launch/agg_multimodal.sh b/examples/backends/vllm/launch/agg_multimodal.sh index 2bda87a347..d016980331 100755 --- a/examples/backends/vllm/launch/agg_multimodal.sh +++ b/examples/backends/vllm/launch/agg_multimodal.sh @@ -45,7 +45,8 @@ done export DYN_REQUEST_PLANE=tcp # Start frontend with Rust OpenAIPreprocessor -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # Configure GPU memory optimization for specific models EXTRA_ARGS="" @@ -59,7 +60,7 @@ fi # Multimodal data (images) are decoded in the backend worker using ImageLoader # --enforce-eager: Quick deployment (remove for production) # --connector none: No KV transfer needed for aggregated serving -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS # Wait for all background processes to complete diff --git a/examples/backends/vllm/launch/agg_multimodal_epd.sh b/examples/backends/vllm/launch/agg_multimodal_epd.sh index 37052401ff..a94ab3c1f4 100755 --- a/examples/backends/vllm/launch/agg_multimodal_epd.sh +++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh @@ -64,7 +64,8 @@ else fi # Start frontend (HTTP endpoint) -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments EXTRA_ARGS="" diff --git a/examples/backends/vllm/launch/agg_multimodal_llama.sh b/examples/backends/vllm/launch/agg_multimodal_llama.sh index 73c426f5b7..12cf8406c4 100755 --- a/examples/backends/vllm/launch/agg_multimodal_llama.sh +++ b/examples/backends/vllm/launch/agg_multimodal_llama.sh @@ -8,7 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # run processor python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n" & diff --git a/examples/backends/vllm/launch/agg_request_planes.sh b/examples/backends/vllm/launch/agg_request_planes.sh index 63f018a12d..b8a3b1b9a3 100755 --- a/examples/backends/vllm/launch/agg_request_planes.sh +++ b/examples/backends/vllm/launch/agg_request_planes.sh @@ -41,8 +41,9 @@ export DYN_REQUEST_PLANE=$REQUEST_PLANE echo "Using request plane mode: $REQUEST_PLANE" # Frontend -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_HEALTH_CHECK_ENABLED=true \ python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --connector none diff --git a/examples/backends/vllm/launch/agg_router.sh b/examples/backends/vllm/launch/agg_router.sh index 9a5bfa741b..17c41c6d48 100755 --- a/examples/backends/vllm/launch/agg_router.sh +++ b/examples/backends/vllm/launch/agg_router.sh @@ -12,9 +12,9 @@ MODEL="Qwen/Qwen3-0.6B" BLOCK_SIZE=64 # run frontend + KV router +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) python -m dynamo.frontend \ --router-mode kv \ - --http-port 8000 \ --router-reset-states & # run workers diff --git a/examples/backends/vllm/launch/dep.sh b/examples/backends/vllm/launch/dep.sh index 0fc80a7f92..34d1fffde3 100755 --- a/examples/backends/vllm/launch/dep.sh +++ b/examples/backends/vllm/launch/dep.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --router-mode kv --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend --router-mode kv & # Data Parallel Attention / Expert Parallelism # Routing to DP workers managed by Dynamo diff --git a/examples/backends/vllm/launch/disagg.sh b/examples/backends/vllm/launch/disagg.sh index e06a21ecf6..7a5a2813a3 100755 --- a/examples/backends/vllm/launch/disagg.sh +++ b/examples/backends/vllm/launch/disagg.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # --enforce-eager is added for quick deployment. for production use, need to remove this flag CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & diff --git a/examples/backends/vllm/launch/disagg_kvbm.sh b/examples/backends/vllm/launch/disagg_kvbm.sh index 1e6509eef3..63bb90bab1 100755 --- a/examples/backends/vllm/launch/disagg_kvbm.sh +++ b/examples/backends/vllm/launch/disagg_kvbm.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # run decode worker on GPU 0, without enabling KVBM # NOTE: remove --enforce-eager for production use diff --git a/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh b/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh index af06470bdf..e41221acfe 100755 --- a/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh +++ b/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress with KV router -python -m dynamo.frontend --router-mode kv --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend --router-mode kv & # run decode workers on GPU 0 and 1, without enabling KVBM # NOTE: remove --enforce-eager for production use diff --git a/examples/backends/vllm/launch/disagg_kvbm_router.sh b/examples/backends/vllm/launch/disagg_kvbm_router.sh index 86a63a9134..9e92abf106 100755 --- a/examples/backends/vllm/launch/disagg_kvbm_router.sh +++ b/examples/backends/vllm/launch/disagg_kvbm_router.sh @@ -10,9 +10,9 @@ export PYTHONHASHSEED=0 # Common configuration MODEL="Qwen/Qwen3-0.6B" +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) python -m dynamo.frontend \ --router-mode kv \ - --http-port 8000 \ --router-reset-states & # two decode workers (without KVBM) diff --git a/examples/backends/vllm/launch/disagg_lmcache.sh b/examples/backends/vllm/launch/disagg_lmcache.sh index 9c946c60b8..3734166698 100755 --- a/examples/backends/vllm/launch/disagg_lmcache.sh +++ b/examples/backends/vllm/launch/disagg_lmcache.sh @@ -5,7 +5,8 @@ set -e trap 'echo Cleaning up...; kill 0' EXIT # run ingress with KV router -python -m dynamo.frontend --router-mode kv --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend --router-mode kv & # run decode worker on GPU 0, without enabling LMCache CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B & diff --git a/examples/backends/vllm/launch/disagg_multimodal_epd.sh b/examples/backends/vllm/launch/disagg_multimodal_epd.sh index b392a83946..75b30abb8e 100755 --- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh +++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh @@ -72,7 +72,8 @@ echo "==================================================" # Start frontend (no router mode) echo "Starting frontend..." -python -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python -m dynamo.frontend & # Start processor echo "Starting processor..." diff --git a/examples/backends/vllm/launch/disagg_multimodal_llama.sh b/examples/backends/vllm/launch/disagg_multimodal_llama.sh index f4a2707686..a4174c877c 100755 --- a/examples/backends/vllm/launch/disagg_multimodal_llama.sh +++ b/examples/backends/vllm/launch/disagg_multimodal_llama.sh @@ -45,7 +45,8 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" if [[ $HEAD_NODE -eq 1 ]]; then # run ingress - python -m dynamo.frontend --http-port=8000 & + # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) + python -m dynamo.frontend & # run processor python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n" & diff --git a/examples/backends/vllm/launch/disagg_router.sh b/examples/backends/vllm/launch/disagg_router.sh index 51921bbf8b..7382c9ab08 100755 --- a/examples/backends/vllm/launch/disagg_router.sh +++ b/examples/backends/vllm/launch/disagg_router.sh @@ -13,9 +13,9 @@ BLOCK_SIZE=64 # Start frontend with KV routing # The frontend will automatically detect prefill workers and activate an internal prefill router +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) python -m dynamo.frontend \ --router-mode kv \ - --http-port 8000 \ --router-reset-states & # two decode workers diff --git a/examples/backends/vllm/launch/disagg_same_gpu.sh b/examples/backends/vllm/launch/disagg_same_gpu.sh index 91a2dac47b..640a4e4e3a 100755 --- a/examples/backends/vllm/launch/disagg_same_gpu.sh +++ b/examples/backends/vllm/launch/disagg_same_gpu.sh @@ -42,12 +42,13 @@ cleanup() { trap cleanup EXIT INT TERM # run ingress -python3 -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & DYNAMO_PID=$! # run decode worker with metrics on port 8081 # --enforce-eager is added for quick deployment. for production use, need to remove this flag -DYN_SYSTEM_PORT=8081 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ CUDA_VISIBLE_DEVICES=0 \ python3 -m dynamo.vllm \ --model Qwen/Qwen3-0.6B \ @@ -65,7 +66,7 @@ echo "Waiting for decode worker to initialize..." sleep 10 # run prefill worker with metrics on port 8082 (foreground) -DYN_SYSTEM_PORT=8082 \ +DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_PREFILL:-8082} \ DYN_VLLM_KV_EVENT_PORT=20081 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ CUDA_VISIBLE_DEVICES=0 \ diff --git a/examples/backends/vllm/launch/dsr1_dep.sh b/examples/backends/vllm/launch/dsr1_dep.sh index 95211d8b4f..226101e67a 100755 --- a/examples/backends/vllm/launch/dsr1_dep.sh +++ b/examples/backends/vllm/launch/dsr1_dep.sh @@ -82,8 +82,9 @@ echo " Model name: $MODEL" trap 'echo Cleaning up...; kill 0' EXIT # run ingress if it's node 0 +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) if [ $NODE_RANK -eq 0 ]; then - DYN_LOG=debug python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log & + DYN_LOG=debug python -m dynamo.frontend --router-mode kv 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log & fi mkdir -p $LOG_DIR diff --git a/examples/basics/multinode/trtllm/start_frontend_services.sh b/examples/basics/multinode/trtllm/start_frontend_services.sh index 93eafbb06c..c9295d22cb 100755 --- a/examples/basics/multinode/trtllm/start_frontend_services.sh +++ b/examples/basics/multinode/trtllm/start_frontend_services.sh @@ -12,5 +12,6 @@ etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0 sleep 3 # Start OpenAI Frontend which will dynamically discover workers when they startup +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) # NOTE: This is a blocking call. -python3 -m dynamo.frontend --http-port 8000 +python3 -m dynamo.frontend diff --git a/tests/serve/launch/template_verifier.sh b/tests/serve/launch/template_verifier.sh index b07d5ac88f..61c843e876 100755 --- a/tests/serve/launch/template_verifier.sh +++ b/tests/serve/launch/template_verifier.sh @@ -17,7 +17,8 @@ cleanup() { trap cleanup EXIT INT TERM # run ingress -python3 -m dynamo.frontend --http-port=8000 & +# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000) +python3 -m dynamo.frontend & FRONTEND_PID=$! # run the mock worker + template validation generate()