Commit: general improvement
- SERVED_MODEL_NAME env variable introduced
- DEPLOYED_MODEL_NAME renamed to HF_MODEL_NAME
- ENABLE_AUTO_RESTART and ENABLE_TEAMS_NOTIFICATION env variables added
ilkersigirci committed Sep 25, 2024
1 parent c5db6ef commit 4c47512
Showing 14 changed files with 54 additions and 38 deletions.
5 changes: 4 additions & 1 deletion .env
@@ -4,9 +4,12 @@ HF_HOME=/workspace/runpod-playground/huggingface
HF_HUB_ENABLE_HF_TRANSFER=1
# HF_TOKEN=TO_BE_FILLED
# RUNPOD_API_KEY=TO_BE_FILLED
DEPLOYED_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
HF_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
SERVED_MODEL_NAME=c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=32000
ENABLE_HEALTH_CHECK=1
ENABLE_AUTO_RESTART=0
ENABLE_TEAMS_NOTIFICATION=1
API_ENDPOINT=http://0.0.0.0:8000
TEAMS_WEBHOOK_URL=DUMMY
TEAMS_MESSAGE_TITLE=MODEL
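
For orientation, a minimal sketch of how a script might load and sanity-check the variables introduced or renamed above (the variable names come from the `.env` diff; the script itself and its checks are illustrative, not part of this commit):

```bash
#!/usr/bin/env bash
# check_env.sh -- illustrative sketch, not part of the repository.
set -euo pipefail

# Load the .env file, as the repository's scripts do.
source .env

# Fail fast if the renamed/new variables are missing.
: "${HF_MODEL_NAME:?set to the Hugging Face repo id, e.g. alpindale/c4ai-command-r-plus-GPTQ}"
: "${SERVED_MODEL_NAME:?set to the name the API should serve the model under}"
: "${MAX_CONTEXT_LEN:?set to the desired context length}"

echo "Deploying $HF_MODEL_NAME as '$SERVED_MODEL_NAME' (context: $MAX_CONTEXT_LEN tokens)"
echo "Auto restart: ${ENABLE_AUTO_RESTART:-0}, Teams notifications: ${ENABLE_TEAMS_NOTIFICATION:-0}"
```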
3 changes: 2 additions & 1 deletion .gitignore
@@ -267,4 +267,5 @@ cython_debug/
/.ruff_cache/
models
huggingface
*.txt
*.txt
.env.local
26 changes: 14 additions & 12 deletions Makefile
@@ -5,8 +5,6 @@ SHELL=/bin/bash

LIBRARY_BASE_PATH=/workspace/runpod-playground
PYTHON=python
DEPLOYED_MODEL_NAME=alpindale/c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=32000

.PHONY: help install gui
.DEFAULT_GOAL=help
@@ -30,26 +28,29 @@ install-uv:
! command -v uv &> /dev/null && curl -LsSf https://astral.sh/uv/install.sh | sh
# echo '. "$$HOME/.cargo/env"' >> ~/.bashrc

install: ## Installs the development version of the package
install-package: ## Installs the development version of the package
$(MAKE) install-uv
uv sync --frozen

change-model-env: ## Change the model that is specified in the .env file
# sed -i 's/DEPLOYED_MODEL_NAME=alpindale\/WizardLM-2-8x22B/DEPLOYED_MODEL_NAME=CohereForAI\/c4ai-command-r-v01/g' .env
sed -i '/DEPLOYED_MODEL_NAME=/d' .env
echo "DEPLOYED_MODEL_NAME=${DEPLOYED_MODEL_NAME}" >> .env
# sed -i 's/HF_MODEL_NAME=alpindale\/WizardLM-2-8x22B/HF_MODEL_NAME=CohereForAI\/c4ai-command-r-v01/g' .env
sed -i '/HF_MODEL_NAME=/d' .env
echo "HF_MODEL_NAME=${HF_MODEL_NAME}" >> .env

change-max-context-len-env: ## Change the max context length that is specified in the .env file
# sed -i 's/MAX_CONTEXT_LEN=32000/MAX_CONTEXT_LEN=40000/g' .env
sed -i '/MAX_CONTEXT_LEN=/d' .env
echo "MAX_CONTEXT_LEN=${MAX_CONTEXT_LEN}" >> .env

initial-runpod-install: ## Install necessary tools and packages for Runpod, also install project dependencies
nohup bash ${LIBRARY_BASE_PATH}/scripts/initial_install.sh > initial_runpod_install_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &

download-model: ## Download the model that is specified in the .env file
nohup bash ${LIBRARY_BASE_PATH}/scripts/download_model.sh > download_model_log.txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/download_model.sh > download_model_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &

start-vllm: ## Start the VLLM server
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/run_preodically_basic.sh ${LIBRARY_BASE_PATH}/scripts/healthcheck_model_api.sh > healthcheck_periodically.txt 300 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &
nohup bash ${LIBRARY_BASE_PATH}/scripts/run_preodically_basic.sh ${LIBRARY_BASE_PATH}/scripts/healthcheck_model_api.sh > healthcheck_periodically_$(shell date +%Y%m%d_%H%M%S).txt 300 2>&1 &

stop-vllm: ## Stop the VLLM server
pkill -f 'run_preodically_basic|vllm.entrypoints'
@@ -58,8 +59,9 @@ restart-vllm: ## Stops and starts the VLLM server
$(MAKE) stop-vllm
$(MAKE) start-vllm

log-vllm: ## Show the log of the VLLM server
tail -f -n 100 vllm_log.txt
log-vllm: ## Show the log of the VLLM server, only the last log file
@last_log_file=$(shell ls -t vllm_log_*.txt | head -n 1); \
tail -f -n 100 $$last_log_file

send-chat-message: ## Send a chat message to the VLLM server
bash ${LIBRARY_BASE_PATH}/scripts/send_api_chat_message.sh send_message_with_system
@@ -72,4 +74,4 @@ gui: ## Start the GUI
--server.port 5000 \
--server.enableCORS=false \
--server.enableXsrfProtection=false \
runpod_playground/gui/main.py > streamlit_log.txt 2>&1 &
runpod_playground/gui/main.py > streamlit_log_$(shell date +%Y%m%d_%H%M%S).txt 2>&1 &
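
A usage note on the timestamped logs introduced above: each `start-vllm` run now writes to its own `vllm_log_YYYYMMDD_HHMMSS.txt`, and `log-vllm` tails the newest one. The same pattern, reproduced manually from the repository root (a sketch; `make start-vllm` / `make log-vllm` remain the intended entry points):

```bash
# Start the server, then follow the most recently created log file.
make start-vllm
last_log_file=$(ls -t vllm_log_*.txt | head -n 1)
tail -f -n 100 "$last_log_file"
```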
7 changes: 4 additions & 3 deletions README.md
@@ -31,12 +31,13 @@ make restart-vllm
make gui
```

- To deploy a different model, change the `DEPLOYED_MODEL_NAME` variable in the `.env` file to the model you want to deploy, following the Hugging Face repository id convention.
- To deploy a different model, change the `HF_MODEL_NAME` variable in the `.env` file to the model you want to deploy, following the Hugging Face repository id convention.
- You can also change `SERVED_MODEL_NAME` to set the model name used for API requests.
- The `MAX_CONTEXT_LEN` variable can also be changed to the desired context length.
- Example: change the default model and its context length to CohereForAI/c4ai-command-r-v01
- Example: change the default model and its context length to CohereForAI/c4ai-command-r-plus-GPTQ

```bash
make change-model-env DEPLOYED_MODEL_NAME=CohereForAI/c4ai-command-r-v01
make change-model-env HF_MODEL_NAME=CohereForAI/c4ai-command-r-plus-GPTQ
make change-max-context-len-env MAX_CONTEXT_LEN=40000

```
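
Judging from the `change-model-env` and `change-max-context-len-env` targets in the Makefile diff above, the two commands delete the old entries and append new ones, so afterwards `.env` should contain lines like the following (a sketch using the values from the new example line):

```bash
# .env (relevant lines after running the two make targets above)
HF_MODEL_NAME=CohereForAI/c4ai-command-r-plus-GPTQ
MAX_CONTEXT_LEN=40000
```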
6 changes: 3 additions & 3 deletions notebooks/vllm_playground.ipynb
@@ -8,12 +8,12 @@
"\n",
"```bash\n",
"# Download the model\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1-Q8_0.gguf\"\n",
"huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1.imatrix\"\n",
"huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1-Q8_0.gguf\"\n",
"huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME --include \"Codestral-22B-v0.1.imatrix\"\n",
"\n",
"\n",
"# Start the server\n",
"SERVED_MODEL_NAME=\"${DEPLOYED_MODEL_NAME#*/}\"\n",
"SERVED_MODEL_NAME=\"${HF_MODEL_NAME#*/}\"\n",
"MODEL_PATH=$LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME/Codestral-22B-v0.1-Q8_0.gguf\n",
"GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)\n",
"\n",
Empty file.
Empty file.
4 changes: 2 additions & 2 deletions runpod_playground/download_model.py
@@ -29,13 +29,13 @@ def download_model_hf(
if __name__ == "__main__":
load_dotenv()

DEPLOYED_MODEL_NAME = os.getenv("DEPLOYED_MODEL_NAME", "alpindale/WizardLM-2-8x22B")
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "alpindale/c4ai-command-r-plus-GPTQ")
revision = "main"
# revision = "6.0bpw"

# ignore_patterns = ["*.pt"]
ignore_patterns = ["*.pt", "*.bin"]

download_model_hf(
repo_id=DEPLOYED_MODEL_NAME, revision=revision, ignore_patterns=ignore_patterns
repo_id=HF_MODEL_NAME, revision=revision, ignore_patterns=ignore_patterns
)
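
A hedged usage sketch for the downloader above: `load_dotenv()` plus `os.getenv("HF_MODEL_NAME", ...)` means the repo id is normally taken from `.env`, and with python-dotenv's defaults a variable already set in the shell takes precedence, so a one-off override is also possible (the invocation path is assumed from the file location):

```bash
# Download the model named in .env
python runpod_playground/download_model.py

# One-off override of the repo id for a single run
HF_MODEL_NAME=CohereForAI/c4ai-command-r-v01 python runpod_playground/download_model.py
```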
9 changes: 4 additions & 5 deletions runpod_playground/gui/main.py
@@ -3,17 +3,16 @@
import streamlit as st
from openai import OpenAI

DEPLOYED_MODEL_NAME = os.getenv("DEPLOYED_MODEL_NAME", None)
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", None)
SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", None)
API_ENDPOINT = os.getenv("API_ENDPOINT", None)

if DEPLOYED_MODEL_NAME is None or API_ENDPOINT is None:
if HF_MODEL_NAME is None or SERVED_MODEL_NAME is None or API_ENDPOINT is None:
st.error(
"Please set the DEPLOYED_MODEL_NAME and API_ENDPOINT environment variables."
"Please set the HF_MODEL_NAME, SERVED_MODEL_NAME and API_ENDPOINT environment variables."
)
st.stop()

SERVED_MODEL_NAME = DEPLOYED_MODEL_NAME.split("/")[-1]

st.title("VLLM Server Test")

client = OpenAI(api_key="NONE", base_url=f"{API_ENDPOINT}/v1")
4 changes: 2 additions & 2 deletions scripts/download_model.sh
@@ -13,7 +13,7 @@ source $HOME/.cargo/env bash
source $LIBRARY_BASE_PATH/.venv/bin/activate

# Download model if not already present
SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"
# SERVED_MODEL_NAME="${HF_MODEL_NAME#*/}"

if [ ! $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME ]; then
echo "******** $SERVED_MODEL_NAME already downloaded ********"
@@ -26,4 +26,4 @@ if ! uv pip show hf_transfer >/dev/null 2>&1; then
fi

echo "******** Downloading model ********"
huggingface-cli download $DEPLOYED_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME
huggingface-cli download $HF_MODEL_NAME --repo-type model --revision main --local-dir $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME
20 changes: 14 additions & 6 deletions scripts/healthcheck_model_api.sh
@@ -26,14 +26,18 @@ RESPONSE=$(send_guided_regex_message)

# Check if the curl command timed out
if echo "$RESPONSE" | grep -q "Request timed out."; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
if [ "$ENABLE_AUTO_RESTART" = "1" ]; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
fi

# Send message to Teams Chat
MESSAGE="Request timed out. Hence, the model api is restarted."
TITLE="${TEAMS_MESSAGE_TITLE} - POD FAILURE"

send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
if [ "$ENABLE_TEAMS_NOTIFICATION" = "1" ]; then
send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
fi

exit 1
fi
@@ -46,11 +50,15 @@ fi

echo "API response did not contain '200 OK'."

pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
if [ "$ENABLE_AUTO_RESTART" = "1" ]; then
pkill -f vllm.entrypoints
nohup bash ${LIBRARY_BASE_PATH}/scripts/start_vllm.sh > vllm_log.txt 2>&1 &
fi

# Send message to Teams Chat
MESSAGE="The model didn't correctly respond. Hence, the model api is restarted."
TITLE="${TEAMS_MESSAGE_TITLE} - POD FAILURE"

send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
if [ "$ENABLE_TEAMS_NOTIFICATION" = "1" ]; then
send_teams_message "$TEAMS_WEBHOOK_URL" "$MESSAGE" "$TITLE"
fi
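
With the two guards above, restart and notification behaviour can be toggled independently from `.env`; for example, the defaults shown in this commit's `.env` diff keep Teams notifications on while disabling automatic restarts:

```bash
# .env -- notify on failures, but do not restart the server automatically
ENABLE_AUTO_RESTART=0
ENABLE_TEAMS_NOTIFICATION=1
```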
5 changes: 5 additions & 0 deletions scripts/run_preodically_basic.sh
@@ -3,6 +3,11 @@
# Load .env file
source $(dirname "$(realpath "$0")")/../.env

if [ "$ENABLE_HEALTH_CHECK" = "0" ]; then
echo "HEALTH CHECK IS DISABLED."
exit 1
fi

# Define the full path to your script
SCRIPT_PATH="$1"

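For context, the Makefile's `start-vllm` target launches this runner with the health-check script and a trailing `300` argument; a manual equivalent might look like the sketch below (treating `300` as the check interval in seconds is an assumption, since the argument handling is outside the shown hunk):

```bash
# Run the health check periodically in the background (sketch; interval semantics assumed)
nohup bash scripts/run_preodically_basic.sh scripts/healthcheck_model_api.sh 300 \
    > healthcheck_periodically_$(date +%Y%m%d_%H%M%S).txt 2>&1 &
```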
2 changes: 0 additions & 2 deletions scripts/send_api_chat_message.sh
@@ -3,8 +3,6 @@
# Load .env file
source $(dirname "$(realpath "$0")")/../.env

SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"

send_health_check_message() {
local response=$(curl -s -o /dev/null -w "%{http_code}" "$API_ENDPOINT/health")

1 change: 0 additions & 1 deletion scripts/start_vllm.sh
@@ -20,7 +20,6 @@ source $HOME/.cargo/env bash
source $LIBRARY_BASE_PATH/.venv/bin/activate

# Download model if not already present
SERVED_MODEL_NAME="${DEPLOYED_MODEL_NAME#*/}"
if [ ! -d $LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME ]; then
bash $LIBRARY_BASE_PATH/scripts/download_model.sh
fi
