codegen: add vLLM as default inference engine (#883)

Merged 1 commit on Mar 25, 2025.
helm-charts/codegen/Chart.yaml (5 additions, 0 deletions)

```diff
@@ -9,6 +9,11 @@ dependencies:
   - name: tgi
     version: 0-latest
     repository: "file://../common/tgi"
+    condition: tgi.enabled
+  - name: vllm
+    version: 0-latest
+    repository: "file://../common/vllm"
+    condition: vllm.enabled
   - name: llm-uservice
     version: 0-latest
     repository: "file://../common/llm-uservice"
```
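The two `condition:` fields are what make the backends switchable: Helm only renders a subchart whose condition evaluates to true in the merged values. Because the dependency list changed, the vendored subcharts need refreshing before installing; this reuses the same `helm dependency` workflow the README below already relies on:

```bash
# Re-vendor subcharts so the new vllm dependency is pulled in
helm dependency update codegen
# Verify tgi, vllm, and llm-uservice all resolve with STATUS "ok"
helm dependency list codegen
```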
helm-charts/codegen/README.md (8 additions, 4 deletions)

````diff
@@ -15,10 +15,14 @@ helm dependency update codegen
 export HFTOKEN="insert-your-huggingface-token-here"
 export MODELDIR="/mnt/opea-models"
 export MODELNAME="Qwen/Qwen2.5-Coder-7B-Instruct"
-# To run on Xeon
-helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
-# To run on Gaudi
-#helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f codegen/gaudi-values.yaml
+# To use CPU with vLLM
+helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f cpu-values.yaml
+# To use CPU with TGI
+# helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f cpu-tgi-values.yaml
+# To use Gaudi device with vLLM
+# helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f gaudi-values.yaml
+# To use Gaudi device with TGI
+# helm install codegen codegen --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f gaudi-tgi-values.yaml
 ```
 
 ### IMPORTANT NOTE
````
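After any of these installs, it is worth confirming the pods are healthy before calling the service. A minimal check, assuming the release is named `codegen` and that the chart applies the standard `app.kubernetes.io/instance` label (an assumption, not verified in this diff):

```bash
# Block until every pod in the release reports Ready (model download can take a while)
kubectl wait --for=condition=Ready pod \
  -l app.kubernetes.io/instance=codegen --timeout=600s
kubectl get pods -l app.kubernetes.io/instance=codegen
```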
helm-charts/codegen/cpu-tgi-values.yaml (new file, 9 additions)

```diff
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+  enabled: true
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
```
helm-charts/codegen/cpu-values.yaml (5 additions, 1 deletion)

```diff
@@ -2,4 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
-  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
+  enabled: false
+vllm:
+  enabled: true
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
```
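Together with cpu-tgi-values.yaml above, this shows the CPU profiles differ in exactly three keys: which backend subchart is enabled and which backend `llm-uservice` routes to. The same switch can therefore be expressed inline; a sketch of the TGI variant without any values file (other overrides omitted):

```bash
helm install codegen codegen \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --set tgi.enabled=true \
  --set vllm.enabled=false \
  --set llm-uservice.TEXTGEN_BACKEND=TGI
```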
helm-charts/codegen/gaudi-tgi-values.yaml (new file, 37 additions)

```diff
@@ -0,0 +1,37 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
```
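The probe numbers encode the startup budget: with `periodSeconds: 5` and `failureThreshold: 120`, the startup probe tolerates 120 × 5 s = 10 minutes of failed checks before Kubernetes restarts the TGI pod. For models that load more slowly, the threshold can be raised at install time; this should merge the same way the values file above does, though that merging behavior is assumed rather than verified here:

```bash
# Roughly double the startup budget: 240 failures x 5s period = 20 minutes
helm upgrade --install codegen codegen \
  -f codegen/gaudi-tgi-values.yaml \
  --set tgi.startupProbe.failureThreshold=240
```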
helm-charts/codegen/gaudi-values.yaml (18 additions, 24 deletions)

```diff
@@ -2,32 +2,26 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  enabled: false
+
+vllm:
+  enabled: true
   accelDevice: "gaudi"
-  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  startupProbe:
+    failureThreshold: 360
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+  ]
+
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
```
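Note the shape change: TGI's HPU-graph environment knobs give way to vLLM's own engine flags passed via `extraCmdArgs`, the startup probe tolerates 360 failures instead of 120, and `retryTimeoutSeconds: 720` gives the `llm-uservice` wrapper up to 12 minutes to reach the engine while the model loads. A quick end-to-end probe of the deployed gateway, with `<port>` standing in for the chart's service port (not specified in this diff) and the release assumed to be named `codegen`:

```bash
kubectl port-forward svc/codegen <port>:<port> &
curl http://localhost:<port>/v1/codegen \
  -H 'Content-Type: application/json' \
  -d '{"messages": "def quicksort(arr):"}'
```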
helm-charts/codegen/templates/tests/test-pod.yaml (1 addition, 1 deletion)

```diff
@@ -20,7 +20,7 @@ spec:
       max_retry=20;
       for ((i=1; i<=max_retry; i++)); do
         curl http://{{ include "codegen.fullname" . }}:{{ .Values.service.port }}/v1/codegen -sS --fail-with-body \
-          -d '{"messages": "def print_hello_world():"}' \
+          -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}' \
          -H 'Content-Type: application/json' && break;
        curlcode=$?
        if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
```
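Because this manifest lives under `templates/tests/`, it only runs when invoked via Helm's test hook; the richer prompt forces a longer, more realistic generation than `def print_hello_world():` did. To exercise it against a running release named `codegen`:

```bash
# Runs the test pod to completion and streams its output
helm test codegen --logs
```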
helm-charts/codegen/values.yaml (6 additions, 0 deletions)

```diff
@@ -58,9 +58,15 @@ affinity: {}
 
 # To override values in subchart tgi
 tgi:
+  enabled: false
   LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
 
+vllm:
+  enabled: true
+  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
+
 llm-uservice:
+  TEXTGEN_BACKEND: vLLM
   LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
 
 nginx:
```
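These defaults are what make vLLM the out-of-the-box engine: a plain install with no `-f` file now brings up vLLM, and TGI stays dormant behind its flag. A minimal sketch, supplying only the required token:

```bash
helm install codegen codegen \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
```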
helm-charts/valuefiles.yaml (2 additions, 0 deletions)

```diff
@@ -45,7 +45,9 @@ codegen:
   dest_dir: CodeGen/kubernetes/helm
   values:
     - cpu-values.yaml
+    - cpu-tgi-values.yaml
     - gaudi-values.yaml
+    - gaudi-tgi-values.yaml
 codetrans:
   src_repo: GenAIInfra
   src_dir: helm-charts/codetrans
```