From f1e1d251e9d9feb70ae2911794ee368e09be0726 Mon Sep 17 00:00:00 2001
From: Lite Ye
Date: Thu, 7 Mar 2024 13:54:55 -0500
Subject: [PATCH] Bump the default max_num_seq to 2048

---
 serve/mlc_serve/engine/base.py | 2 +-
 serve/mlc_serve/utils.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/serve/mlc_serve/engine/base.py b/serve/mlc_serve/engine/base.py
index 5ba3b74405..161553f65d 100644
--- a/serve/mlc_serve/engine/base.py
+++ b/serve/mlc_serve/engine/base.py
@@ -31,7 +31,7 @@ class MLCServeEngineConfig:
     # TODO(@sunggg): figure out better defaults
     use_staging_engine: bool = True
     max_num_batched_tokens: int = 4096
-    max_num_seq: int = 256
+    max_num_seq: int = 2048
     max_num_seq_per_request: Optional[int] = None  # default to `max_num_seq / 4`
     min_decode_steps: int = 32
     max_decode_steps: int = 48
diff --git a/serve/mlc_serve/utils.py b/serve/mlc_serve/utils.py
index e1d9ca80aa..dffdebf742 100644
--- a/serve/mlc_serve/utils.py
+++ b/serve/mlc_serve/utils.py
@@ -30,7 +30,7 @@ def get_default_mlc_serve_argparser(description="", allow_override=False):
     parser.add_argument("--use-sync-engine", action="store_true")
     parser.add_argument("--num-sequences-to-sample", type=int, default=1)
     parser.add_argument("--max-num-batched-tokens", type=int, default=4096)
-    parser.add_argument("--max-num-seq", type=int, default=256)
+    parser.add_argument("--max-num-seq", type=int, default=2048)
     parser.add_argument("--min-decode-steps", type=int, default=32)
     parser.add_argument("--max-decode-steps", type=int, default=56)
     parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)