Commit

vllm and uv upgraded

ilkersigirci committed Aug 26, 2024
1 parent 267e536 commit b565c3d
Showing 8 changed files with 113 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -1,4 +1,4 @@
-DEPLOYED_PYTHON_VERSION=3.10
+DEPLOYED_PYTHON_VERSION=3.11
LIBRARY_BASE_PATH=/workspace/runpod-playground
HF_HOME=/workspace/runpod-playground/huggingface
HF_HUB_ENABLE_HF_TRANSFER=1
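Not part of the diff, but a quick way to sanity-check the bumped interpreter after provisioning (a sketch; the `.env` path is the one the scripts below already source):

```bash
# Load the updated .env and confirm the requested Python resolves.
source /workspace/runpod-playground/.env
if command -v "python$DEPLOYED_PYTHON_VERSION" >/dev/null 2>&1; then
    "python$DEPLOYED_PYTHON_VERSION" --version
else
    echo "python$DEPLOYED_PYTHON_VERSION not on PATH yet; install_uv.sh now defers interpreter installs to uv"
fi
```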
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -13,7 +13,7 @@
},
],
},
"python.envFile": "", // Disable auto importing .env file values
// "python.envFile": "", // Disable auto importing .env file values
"python.languageServer": "Pylance",
// "python.analysis.autoImportCompletions": true,
"autoDocstring.generateDocstringOnEnter": true,
86 changes: 86 additions & 0 deletions notebooks/vllm_playground.ipynb
@@ -0,0 +1,86 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GGUF TEST\n",
"\n",
"```bash\n",
"# Download the model\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1-Q8_0.gguf\"\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1.imatrix\"\n",
"\n",
"\n",
"# Start the server\n",
"SERVED_MODEL_NAME=\"${DEPLOYED_MODEL_NAME#*/}\"\n",
"MODEL_PATH=$LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME/Codestral-22B-v0.1-Q8_0.gguf\n",
"GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)\n",
"\n",
"python -m vllm.entrypoints.openai.api_server \\\n",
" --host 0.0.0.0 \\\n",
" --port 8000 \\\n",
" --enable-prefix-caching \\\n",
" --gpu-memory-utilization 0.97 \\\n",
" --tensor-parallel-size $GPU_COUNT \\\n",
" --max-model-len $MAX_CONTEXT_LEN \\\n",
" --model $MODEL_PATH \\\n",
" --served-model-name $SERVED_MODEL_NAME \\\n",
" --quantization gguf\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import hf_hub_download\n",
"from vllm import LLM, SamplingParams\n",
"\n",
"def run_gguf_inference(model_path):\n",
" llm = LLM(\n",
"\tmodel=model_path,\n",
"\tmax_model_len=4096,\n",
"\ttokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
"\ttensor_parallel_size=1, \n",
" )\n",
"\n",
" tokenizer = llm.get_tokenizer()\n",
" conversations = tokenizer.apply_chat_template(\n",
" [{'role': 'user', 'content': 'what is the future of AI?'}],\n",
" tokenize=False,\n",
" add_generation_prompt=True,\n",
" )\n",
"\n",
" outputs = llm.generate(\n",
" [conversations],\n",
" SamplingParams(temperature=0, max_tokens=1000),\n",
" )\n",
" for output in outputs:\n",
"\tprint(output)\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" repo_id = \"bullerwins/Meta-Llama-3.1-8B-Instruct-GGUF\"\n",
" filename = \"Meta-Llama-3.1-8B-Instruct-Q2_K.gguf\"\n",
" model = hf_hub_download(repo_id, filename=filename)\n",
" run_gguf_inference(model)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
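Once the server from the notebook's markdown cell is up, the OpenAI-compatible endpoint can be smoke-tested directly (a sketch; assumes the host/port from the command above and that `SERVED_MODEL_NAME` is still exported):

```bash
# Send one chat request to the vLLM OpenAI-compatible server.
curl -s http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d "{
        \"model\": \"$SERVED_MODEL_NAME\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Write hello world in Python.\"}],
        \"max_tokens\": 128
    }"
```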
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -7,8 +7,9 @@ authors = [
]
dependencies = [
"python-dotenv>=1.0.1",
"huggingface-hub>=0.23.0",
"runpod>=1.6.2",
"huggingface-hub>=0.24.0",
"hf-transfer>=0.1.8",
]
readme = "README.md"
requires-python = ">= 3.11"
@@ -20,6 +21,8 @@ dev-dependencies = [
"ruff==0.6.2",
"ipykernel>=6.29.3",
"vllm>=0.5.5",
"setuptools>=73.0.1",
"modelscope>=1.17.1",
]

[tool.ruff]
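The new `hf-transfer` dependency only takes effect when `HF_HUB_ENABLE_HF_TRANSFER=1` is set, which the `.env` above already does. A hedged example of a download that benefits from it, reusing this repo's variables:

```bash
# With hf-transfer installed and the flag set, huggingface-cli switches to
# the Rust downloader, which is much faster for multi-GB model files.
export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main \
    --local-dir $LIBRARY_BASE_PATH/models/${DEPLOYED_MODEL_NAME#*/}
```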
6 changes: 6 additions & 0 deletions requirements-dev.lock
@@ -121,6 +121,7 @@ gguf==0.9.1
h11==0.14.0
# via httpcore
# via uvicorn
+hf-transfer==0.1.8
httpcore==1.0.5
# via httpx
httptools==0.6.1
@@ -193,6 +194,7 @@ matplotlib-inline==0.1.7
# via ipython
mdurl==0.1.2
# via markdown-it-py
+modelscope==1.17.1
mpmath==1.3.0
# via sympy
msgpack==1.0.8
@@ -378,6 +380,7 @@ regex==2024.7.24
requests==2.32.3
# via datasets
# via huggingface-hub
+# via modelscope
# via outlines
# via pooch
# via ray
@@ -403,6 +406,7 @@ scipy==1.14.1
# via scikit-learn
sentencepiece==0.2.0
# via vllm
+setuptools==73.0.1
shellingham==1.5.4
# via typer
six==1.16.0
@@ -449,6 +453,7 @@ tqdm==4.66.5
# via datasets
# via gguf
# via huggingface-hub
+# via modelscope
# via openai
# via outlines
# via tqdm-loggable
@@ -487,6 +492,7 @@ ujson==5.10.0
# via fastapi
urllib3==2.2.2
# via botocore
+# via modelscope
# via requests
# via runpod
uvicorn==0.30.6
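`modelscope` is the other new dev dependency; a minimal sketch of using it as an alternative download path to the HF Hub (the repo id is illustrative, not from this commit, and the top-level `snapshot_download` import is assumed for `modelscope>=1.17`):

```bash
# Hypothetical example: pull a model from ModelScope instead of the HF Hub.
python - <<'EOF'
from modelscope import snapshot_download  # assumed top-level re-export

local_path = snapshot_download("Qwen/Qwen2-7B-Instruct")  # illustrative repo id
print(local_path)
EOF
```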
1 change: 1 addition & 0 deletions requirements.lock
@@ -76,6 +76,7 @@ fsspec==2024.6.1
h11==0.14.0
# via httpcore
# via uvicorn
+hf-transfer==0.1.8
httpcore==1.0.5
# via httpx
httptools==0.6.1
17 changes: 10 additions & 7 deletions scripts/install_uv.sh
@@ -5,13 +5,13 @@ source /workspace/runpod-playground/.env

cd $LIBRARY_BASE_PATH

-if ! command -v python$DEPLOYED_PYTHON_VERSION >/dev/null 2>&1; then
-echo "******** Updating apt ********"
-apt update -y -qq > /dev/null
-echo "******** Installing python$DEPLOYED_PYTHON_VERSION ********"
-add-apt-repository ppa:deadsnakes/ppa -y && apt update -y -qq > /dev/null
-DEBIAN_FRONTEND=noninteractive TZ=Europe/Turkey apt install python$DEPLOYED_PYTHON_VERSION-full -y -qq > /dev/null
-fi
+# if ! command -v python$DEPLOYED_PYTHON_VERSION >/dev/null 2>&1; then
+# echo "******** Updating apt ********"
+# apt update -y -qq > /dev/null
+# echo "******** Installing python$DEPLOYED_PYTHON_VERSION ********"
+# add-apt-repository ppa:deadsnakes/ppa -y && apt update -y -qq > /dev/null
+# DEBIAN_FRONTEND=noninteractive TZ=Europe/Turkey apt install python$DEPLOYED_PYTHON_VERSION-full -y -qq > /dev/null
+# fi

# Install uv
if ! command -v uv >/dev/null 2>&1; then
@@ -20,6 +20,9 @@ if ! command -v uv >/dev/null 2>&1; then

source $HOME/.cargo/env bash

+# echo "******** Installing python$DEPLOYED_PYTHON_VERSION ********"
+# uv python install $DEPLOYED_PYTHON_VERSION

if [ ! -d $LIBRARY_BASE_PATH/.venv ]; then
echo "******** Creating virtual environment using uv with python$DEPLOYED_PYTHON_VERSION ********"
uv venv --python python$DEPLOYED_PYTHON_VERSION
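The commented block sketches the intended replacement for the apt/deadsnakes path: letting uv manage the interpreter itself. Roughly, that flow would look like this (a sketch; verify the flags against the installed uv version):

```bash
# Have uv fetch the interpreter, then build the venv against it.
uv python install "$DEPLOYED_PYTHON_VERSION"
uv venv --python "python$DEPLOYED_PYTHON_VERSION" "$LIBRARY_BASE_PATH/.venv"
source "$LIBRARY_BASE_PATH/.venv/bin/activate"
python --version
```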
8 changes: 4 additions & 4 deletions scripts/start_vllm.sh
@@ -29,10 +29,7 @@ fi

if ! uv pip show vllm >/dev/null 2>&1; then
echo "******** Installing vllm and its required dependencies ********"
-# TODO: When vllm 0.5.5 is released, remove transformers pinning
-uv pip install vllm==0.5.1 vllm-flash-attn==2.5.9 accelerate "numpy<2.0.0" setuptools "transformers<=4.42.4"
-# uv pip install wheel
-# uv pip install flash-attn==2.5.8 --no-build-isolation
+uv pip install vllm==0.5.5 accelerate setuptools
# Alternative: From github main
# uv pip install git+https://github.com/vllm-project/vllm#main
fi
@@ -46,6 +43,9 @@ python -m vllm.entrypoints.openai.api_server \
--port 8000 \
--enable-prefix-caching \
--gpu-memory-utilization 0.97 \
+--num-scheduler-steps 8 \
+--use-v2-block-manager \
+--disable-log-stats \
--tensor-parallel-size $GPU_COUNT \
--max-model-len $MAX_CONTEXT_LEN \
--model $MODEL_PATH \
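For reference, a readiness probe once the upgraded server boots (a sketch; assumes the `--host 0.0.0.0 --port 8000` flags above):

```bash
# Poll until the OpenAI-compatible API answers, then list the served model.
until curl -sf http://localhost:8000/v1/models >/dev/null; do
    sleep 2
done
curl -s http://localhost:8000/v1/models
```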
