Commit: general improvement
- SERVED_MODEL_NAME env variable introduced
- DEPLOYED_MODEL_NAME renamed to HF_MODEL_NAME
- ENABLE_AUTO_RESTART and ENABLE_TEAMS_NOTIFICATION env variables added
ilkersigirci committed Sep 25, 2024
1 parent c5db6ef commit 4c47512
Showing 14 changed files with 54 additions and 38 deletions.
5 changes: 4 additions & 1 deletion .env
@@ -4,9 +4,12 @@ HF_HOME=/workspace/runpod-playground/huggingface
HF_HUB_ENABLE_HF_TRANSFER=1
# HF_TOKEN=TO_BE_FILLED
# RUNPOD_API_KEY=TO_BE_FILLED
DEPLOYED_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
HF_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
SERVED_MODEL_NAME=c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=32000
ENABLE_HEALTH_CHECK=1
ENABLE_AUTO_RESTART=0
ENABLE_TEAMS_NOTIFICATION=1
API_ENDPOINT=http://0.0.0.0:8000
TEAMS_WEBHOOK_URL=DUMMY
TEAMS_MESSAGE_TITLE=MODEL
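
For orientation, a minimal sketch of how a script might load and sanity-check the variables introduced or renamed above (the variable names come from the `.env` diff; the script itself and its checks are illustrative, not part of this commit):

```bash
#!/usr/bin/env bash
# check_env.sh -- illustrative sketch, not part of the repository.
set -euo pipefail

# Load the .env file, as the repository's scripts do.
source .env

# Fail fast if the renamed/new variables are missing.
: "${HF_MODEL_NAME:?set to the Hugging Face repo id, e.g. alpindale/c4ai-command-r-plus-GPTQ}"
: "${SERVED_MODEL_NAME:?set to the name the API should serve the model under}"
: "${MAX_CONTEXT_LEN:?set to the desired context length}"

echo "Deploying $HF_MODEL_NAME as '$SERVED_MODEL_NAME' (context: $MAX_CONTEXT_LEN tokens)"
echo "Auto restart: ${ENABLE_AUTO_RESTART:-0}, Teams notifications: ${ENABLE_TEAMS_NOTIFICATION:-0}"
```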
3 changes: 2 additions & 1 deletion .gitignore
@@ -267,4 +267,5 @@ cython_debug/
/.ruff_cache/
models
huggingface
*.txt
*.txt
.env.local
26 changes: 14 additions & 12 deletions Makefile
@@ -5,8 +5,6 @@ SHELL=/bin/bash

LIBRARY_BASE_PATH=/workspace/runpod-playground
PYTHON=python
DEPLOYED_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=32000

.PHONY: help install gui
.DEFAULT_GOAL=help
@@ -30,26 +28,29 @@ install-uv:
! command -v uv &> /dev/null && curl -LsSf https://astral.sh/uv/install.sh | sh
# echo '. "$$HOME/.cargo/env"' >> ~/.bashrc

install: ## Installs the development version of the package
install-package: ## Installs the development version of the package
$(MAKE) install-uv
uv sync --frozen

change-model-env: ## Change the model that is specified in the .env file
# sed -i 's/DEPLOYED_MODEL_NAME=alpindale\/WizardLM-2-8x22B/DEPLOYED_MODEL_NAME=CohereForAI\/c4ai-command-r-v01/g' .env
sed -i '/DEPLOYED_MODEL_NAME=/d' .env
echo "DEPLOYED_MODEL_NAME=${DEPLOYED_MODEL_NAME}" >> .env
# sed -i 's/HF_MODEL_NAME=alpindale\/WizardLM-2-8x22B/HF_MODEL_NAME=CohereForAI\/c4ai-command-r-v01/g' .env
sed -i '/HF_MODEL_NAME=/d' .env
echo "HF_MODEL_NAME=${HF_MODEL_NAME}" >> .env

change-max-context-len-env: ## Change the max context length that is specified in the .env file
# sed -i 's/MAX_CONTEXT_LEN=32000/MAX_CONTEXT_LEN=40000/g' .env
sed -i '/MAX_CONTEXT_LEN=/d' .env
echo "MAX_CONTEXT_LEN=${MAX_CONTEXT_LEN}" >> .env

initial-runpod-install: ## Install necessary tools and packages for Runpod, also install project dependencies
nohup bash ${LIBRARY_BASE_PATH}/scripts/initial_install.sh > initial_runpod_install_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &

download-model: ## Download the model that is specified in the .env file
nohup bash ${LIBRARY_BASE_PATH}/scripts/download_model.sh > download_model_log.txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/download_model.sh > download_model_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &

start-vllm: ## Start the VLLM server
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/run_preodically_basic.sh ${LIBRARY_BASE_PATH}/scripts/healthcheck_model_api.sh > healthcheck_periodically.txt 300 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/run_preodically_basic.sh ${LIBRARY_BASE_PATH}/scripts/healthcheck_model_api.sh > healthcheck_periodically_$(shell date +%Y%m%d_%H%M%S).txt 300 2>&1 &

stop-vllm: ## Stop the VLLM server
pkill -f 'run_preodically_basic|vllm.entrypoints'
@@ -58,8 +59,9 @@ restart-vllm: ## Stops and starts the VLLM server
$(MAKE) stop-vllm
$(MAKE) start-vllm

log-vllm: ## Show the log of the VLLM server
tail -f -n 100 vllm_log.txt
log-vllm: ## Show the log of the VLLM server, only the last log file
@last_log_file=$(shell ls -t vllm_log_*.txt | head -n 1); \
tail -f -n 100 $$last_log_file

send-chat-message: ## Send a chat message to the VLLM server
bash ${LIBRARY_BASE_PATH}/scripts/send_api_chat_message.sh send_message_with_system
@@ -72,4 +74,4 @@ gui: ## Start the GUI
--server.port 5000 \
--server.enableCORS=false \
--server.enableXsrfProtection=false \
runpod_playground/gui/main.py > streamlit_log.txt 2>&1 &
runpod_playground/gui/main.py > streamlit_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &
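
A usage note on the timestamped logs introduced above: each `start-vllm` run now writes to its own `vllm_log_YYYYMMDD_HHMMSS.txt`, and `log-vllm` tails the newest one. The same pattern, reproduced manually from the repository root (a sketch; `make start-vllm` / `make log-vllm` remain the intended entry points):

```bash
# Start the server, then follow the most recently created log file.
make start-vllm
last_log_file=$(ls -t vllm_log_*.txt | head -n 1)
tail -f -n 100 "$last_log_file"
```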
7 changes: 4 additions & 3 deletions README.md
@@ -31,12 +31,13 @@ make restart-vllm
make gui
```

- To deploy a different model, change the `DEPLOYED_MODEL_NAME` variable in the `.env` file to the model you want to deploy, following the Hugging Face repository id convention.
- To deploy a different model, change the `HF_MODEL_NAME` variable in the `.env` file to the model you want to deploy, following the Hugging Face repository id convention.
- You can also change `SERVED_MODEL_NAME` to set the model name used for API requests.
- The `MAX_CONTEXT_LEN` variable can also be changed to the desired context length.
- Example: change the default model and its context length to CohereForAI/c4ai-command-r-v01
- Example: change the default model and its context length to CohereForAI/c4ai-command-r-plus-GPTQ

```bash
make change-model-env DEPLOYED_MODEL_NAME=CohereForAI/c4ai-command-r-v01
make change-model-env HF_MODEL_NAME=CohereForAI/c4ai-command-r-plus-GPTQ
make change-max-context-len-env MAX_CONTEXT_LEN=40000

```
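
Judging from the `change-model-env` and `change-max-context-len-env` targets in the Makefile diff above, the two commands delete the old entries and append new ones, so afterwards `.env` should contain lines like the following (a sketch using the values from the new example line):

```bash
# .env (relevant lines after running the two make targets above)
HF_MODEL_NAME=CohereForAI/c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=40000
```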
6 changes: 3 additions & 3 deletions notebooks/vllm_playground.ipynb
@@ -8,12 +8,12 @@
"\n",
"```bash\n",
"# Download the model\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1-Q8_0.gguf\"\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1.imatrix\"\n",
"huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1-Q8_0.gguf\"\n",
"huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1.imatrix\"\n",
"\n",
"\n",
"# Start the server\n",
"SERVED_MODEL_NAME=\"${DEPLOYED_MODEL_NAME#*/}\"\n",
"SERVED_MODEL_NAME=\"${HF_MODEL_NAME#*/}\"\n",
"MODEL_PATH=$LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME/Codestral-22B-v0.1-Q8_0.gguf\n",
"GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)\n",
"\n",
Empty file.
Empty file.
4 changes: 2 additions & 2 deletions runpod_playground/download_model.py
@@ -29,13 +29,13 @@ def download_model_hf(
if __name__ == "__main__":
load_dotenv()

DEPLOYED_MODEL_NAME = os.getenv("DEPLOYED_MODEL_NAME", "alpindale/WizardLM-2-8x22B")
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "alpindale/c4ai-command-r-plus-GPTQ")
revision = "main"
# revision = "6.0bpw"

# ignore_patterns = ["*.pt"]
ignore_patterns = ["*.pt", "*.bin"]

download_model_hf(
repo_id=DEPLOYED_MODEL_NAME, revision=revision, ignore_patterns=ignore_patterns
repo_id=HF_MODEL_NAME, revision=revision, ignore_patterns=ignore_patterns
)
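
A hedged usage sketch for the downloader above: `load_dotenv()` plus `os.getenv("HF_MODEL_NAME", ...)` means the repo id is normally taken from `.env`, and with python-dotenv's defaults a variable already set in the shell takes precedence, so a one-off override is also possible (the invocation path is assumed from the file location):

```bash
# Download the model named in .env
python runpod_playground/download_model.py

# One-off override of the repo id for a single run
HF_MODEL_NAME=CohereForAI/c4ai-command-r-v01 python runpod_playground/download_model.py
```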
9 changes: 4 additions & 5 deletions runpod_playground/gui/main.py
@@ -3,17 +3,16 @@
import streamlit as st
from openai import OpenAI

DEPLOYED_MODEL_NAME = os.getenv("DEPLOYED_MODEL_NAME", None)
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", None)
SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", None)
API_ENDPOINT = os.getenv("API_ENDPOINT", None)

if DEPLOYED_MODEL_NAME is None or API_ENDPOINT is None:
if HF_MODEL_NAME is None or SERVED_MODEL_NAME is None or API_ENDPOINT is None:
st.error(
"Please set the DEPLOYED_MODEL_NAME and API_ENDPOINT environment variables."
"Please set the HF_MODEL_NAME, SERVED_MODEL_NAME and API_ENDPOINT environment variables."
)
st.stop()

SERVED_MODEL_NAME = DEPLOYED_MODEL_NAME.split("/")[-1]

st.title("VLLM Server Test")

client = OpenAI(api_key="NONE", base_url=f"{API_ENDPOINT}/v1")
4 changes: 2 additions & 2 deletions scripts/download_model.sh
@@ -13,7 +13,7 @@ source $HOME/.cargo/env bash
source $LIBRARY_BASE_PATH/.venv/bin/activate

# Download model if not already present
SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"
# SERVED_MODEL_NAME="${HF_MODEL_NAME#*/}"

if [ ! $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME ]; then
echo "******** $SERVED_MODEL_NAME already downloaded ********"
@@ -26,4 +26,4 @@ if ! uv pip show hf_transfer >/dev/null 2>&1; then
fi

echo "******** Downloading model ********"
huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME
huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME
20 changes: 14 additions & 6 deletions scripts/healthcheck_model_api.sh
@@ -26,14 +26,18 @@ RESPONSE=$(send_guided_regex_message)

# Check if the curl command timed out
if echo "$RESPONSE" | grep -q "Request timed out."; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
if [ "$ENABLE_AUTO_RESTART" = "1" ]; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
fi

# Send message to Teams Chat
MESSAGE="Request timed out. Hence, the model api is restarted."
TITLE="${TEAMS_MESSAGE_TITLE} - POD FAILURE"

send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
if [ "$ENABLE_TEAMS_NOTIFICATION" = "1" ]; then
send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
fi

exit 1
fi
@@ -46,11 +50,15 @@ fi

echo "API response did not contain '200 OK'."

pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
if [ "$ENABLE_AUTO_RESTART" = "1" ]; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
fi

# Send message to Teams Chat
MESSAGE="The model didn't correctly respond. Hence, the model api is restarted."
TITLE="${TEAMS_MESSAGE_TITLE} - POD FAILURE"

send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
if [ "$ENABLE_TEAMS_NOTIFICATION" = "1" ]; then
send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
fi
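
With the two guards above, restart and notification behaviour can be toggled independently from `.env`; for example, the defaults shown in this commit's `.env` diff keep Teams notifications on while disabling automatic restarts:

```bash
# .env -- notify on failures, but do not restart the server automatically
ENABLE_AUTO_RESTART=0
ENABLE_TEAMS_NOTIFICATION=1
```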
5 changes: 5 additions & 0 deletions scripts/run_preodically_basic.sh
@@ -3,6 +3,11 @@
# Load .env file
source $(dirname "$(realpath "$0")")/../.env

if [ "$ENABLE_HEALTH_CHECK" = "0" ]; then
echo "HEALTH CHECK IS DISABLED."
exit 1
fi

# Define the full path to your script
SCRIPT_PATH="$1"

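For context, the Makefile's `start-vllm` target launches this runner with the health-check script and a trailing `300` argument; a manual equivalent might look like the sketch below (treating `300` as the check interval in seconds is an assumption, since the argument handling is outside the shown hunk):

```bash
# Run the health check periodically in the background (sketch; interval semantics assumed)
nohup bash scripts/run_preodically_basic.sh scripts/healthcheck_model_api.sh 300 \
    > healthcheck_periodically_$(date +%Y%m%d_%H%M%S).txt 2>&1 &
```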
2 changes: 0 additions & 2 deletions scripts/send_api_chat_message.sh
@@ -3,8 +3,6 @@
# Load .env file
source $(dirname "$(realpath "$0")")/../.env

SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"

send_health_check_message() {
local response=$(curl -s -o /dev/null -w "%{http_code}" "$API_ENDPOINT/health")

1 change: 0 additions & 1 deletion scripts/start_vllm.sh
@@ -20,7 +20,6 @@ source $HOME/.cargo/env bash
source $LIBRARY_BASE_PATH/.venv/bin/activate

# Download model if not already present
SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"
if [ ! -d $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME ]; then
bash $LIBRARY_BASE_PATH/scripts/download_model.sh
fi
