Commit

initial commit
ilker.sigirci committed Mar 19, 2024
1 parent 5f9b271 commit 4846e66
Showing 17 changed files with 406 additions and 0 deletions.
45 changes: 45 additions & 0 deletions download_model.py
@@ -0,0 +1,45 @@
"""
pip install python-dotenv huggingface_hub
HF_HOME=/workspace/huggingface python download_model.py
"""

import os

from dotenv import load_dotenv
from huggingface_hub import snapshot_download


def main():
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    # repo_id = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
    # repo_id = "turboderp/Mixtral-8x7B-instruct-exl2"
    # repo_id = "turboderp/TinyLlama-1B-exl2"
    # repo_id = "CohereForAI/aya-101"
    # repo_id = "wolfram/miquliz-120b-v2.0-5.0bpw-h6-exl2"
    # repo_id = "Trendyol/Trendyol-LLM-7b-base-v0.1"
    # repo_id = "teknium/OpenHermes-2.5-Mistral-7B"
    # repo_id = "sambanovasystems/SambaLingo-Turkish-Chat"

    local_dir_name = repo_id.split("/")[1]

    token = os.getenv("HF_TOKEN", None)

    # hf_hub_download(repo_id=repo_id, filename="config.json", revision="8.0bpw")

    # NOTE: First downloads to cache and then copies to local_dir
    snapshot_download(
        repo_id=repo_id,
        revision="main",
        local_dir=f"./models/{local_dir_name}",
        local_dir_use_symlinks=False,
        # ignore_patterns=["*.pt"],
        ignore_patterns=["*.pt", "*.bin"],
        token=token,
    )


if __name__ == "__main__":
    load_dotenv()

    main()
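
The commented-out hf_hub_download call above points at a quantization branch ("8.0bpw"). A rough sketch of pulling a whole exl2 quantization branch the same way with snapshot_download — assuming a repo such as turboderp/Mixtral-8x7B-instruct-exl2 that publishes its bits-per-weight variants as revisions; the branch name and local directory below are illustrative, not taken from this commit:

    # Hypothetical variant of the download above: fetch a specific exl2
    # quantization branch by passing its branch name as `revision`.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id="turboderp/Mixtral-8x7B-instruct-exl2",
        revision="8.0bpw",  # quantization branch instead of "main"; check the repo's branches
        local_dir="./models/Mixtral-8x7B-instruct-exl2-8.0bpw",
        local_dir_use_symlinks=False,
        ignore_patterns=["*.pt", "*.bin"],
    )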
Empty file added huggingface/.gitkeep
Empty file.
Empty file added models/.gitkeep
Empty file.
5 changes: 5 additions & 0 deletions scripts/initial_install.sh
@@ -0,0 +1,5 @@
#!/bin/bash

if ! command -v nano >/dev/null 2>&1; then
    apt update -y && apt install nano htop nvtop ncdu -y
fi
25 changes: 25 additions & 0 deletions scripts/start_aphrodite.sh
@@ -0,0 +1,25 @@
#!/bin/bash

if ! command -v nano >/dev/null 2>&1; then
    apt update -y && apt install nano htop nvtop ncdu -y
fi

if pip show torch | grep -q "Version: 2.1.1"; then
    pip uninstall torch torchaudio torchvision -y
fi

if ! pip show aphrodite-engine >/dev/null 2>&1; then
    pip install aphrodite-engine
fi

# python -m aphrodite.endpoints.openai.api_server --help
MODEL=/workspace/models/Mixtral-8x7B-Instruct-v0.1 && \
MODEL_NAME=Mixtral-8x7B-Instruct-v0.1 && \
HF_HOME=/workspace/huggingface \
    python -m aphrodite.endpoints.openai.api_server \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.9 \
    --model $MODEL \
    --served-model-name $MODEL_NAME
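
Both aphrodite-engine and vLLM (below) expose an OpenAI-compatible endpoint on the host and port given above. A minimal client sketch, assuming the openai Python package (v1+) is installed and the server is reachable at localhost:8000; the api_key is a placeholder since no auth is configured here:

    # Minimal chat-completions request against the server started above;
    # base_url and model mirror the script's --port and --served-model-name.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    resp = client.chat.completions.create(
        model="Mixtral-8x7B-Instruct-v0.1",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(resp.choices[0].message.content)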
20 changes: 20 additions & 0 deletions scripts/start_tabbyapi.sh
@@ -0,0 +1,20 @@
#!/bin/bash

if ! command -v nano >/dev/null 2>&1; then
    apt update -y && apt install nano htop nvtop ncdu -y
fi

if pip show torch | grep -q "Version: 2.1.1"; then
    pip uninstall torch torchaudio torchvision -y
fi

# Symlink downloaded models into tabbyAPI's models directory
for file in /workspace/models/*; do
    ln -s "$file" "/workspace/tabbyAPI/official-repo/models/$(basename "$file")"
done

if ! pip show exllamav2 >/dev/null 2>&1; then
    pip install -r /workspace/tabbyAPI/official-repo/requirements.txt
fi

cd /workspace/tabbyAPI/official-repo && python3 main.py
29 changes: 29 additions & 0 deletions scripts/start_vllm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

if ! command -v nano >/dev/null 2>&1; then
    apt update -y && apt install nano htop nvtop ncdu -y
fi

if pip show torch | grep -q "Version: 2.1.1"; then
    pip uninstall torchaudio torchvision -y
fi

if ! pip show vllm >/dev/null 2>&1; then
    pip install vllm==0.3.3
fi

# --tensor-parallel-size 2
MODEL=/workspace/models/Mixtral-8x7B-Instruct-v0.1 && \
MODEL_NAME=Mixtral-8x7B-Instruct-v0.1 && \
# MODEL=/workspace/models/Trendyol-LLM-7b-chat-v0.1 && \
# MODEL_NAME=Trendyol && \
# MODEL=/workspace/models/aya-101 && \
# MODEL_NAME=Aya && \
HF_HOME=/workspace/huggingface \
    python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 2 \
    --gpu-memory-utilization 0.9 \
    --model $MODEL \
    --served-model-name $MODEL_NAME
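
For quick experiments without the HTTP server, the same local snapshot can also be run through vLLM's offline Python API; a sketch under the assumption that vllm==0.3.3 is installed as above and two GPUs are available for tensor parallelism:

    # Offline batch generation with the locally downloaded model;
    # tensor_parallel_size and gpu_memory_utilization mirror the server flags above.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/workspace/models/Mixtral-8x7B-Instruct-v0.1",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )
    params = SamplingParams(temperature=0.7, max_tokens=128)
    outputs = llm.generate(["Explain mixture-of-experts models in two sentences."], params)
    print(outputs[0].outputs[0].text)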
141 changes: 141 additions & 0 deletions tabbyAPI/config.yml
@@ -0,0 +1,141 @@
# Sample YAML file for configuration.
# Comment and uncomment values as needed. Every value has a default within the application.
# This file serves as a drop-in for config.yml

# Unless specified in the comments, DO NOT put these options in quotes!
# You can use https://www.yamllint.com/ if you want to check your YAML formatting.

# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters
  host: 0.0.0.0

  # The port to host on (default: 5000)
  port: 8000
  # port: 8080

  # Disable HTTP token authentication with requests
  # WARNING: This will make your instance vulnerable!
  # Turn on this option if you are ONLY connecting from localhost
  disable_auth: True

# Options for logging
logging:
  # Enable prompt logging (default: False)
  prompt: False

  # Enable generation parameter logging (default: False)
  generation_params: False

# Options for sampling
sampling:
  # Override preset name. Find this in the sampler-overrides folder (default: None)
  # This overrides default fallbacks for sampler values that are passed to the API
  # Server-side overrides are NOT needed by default
  # WARNING: Using this can result in a generation speed penalty
  override_preset: sample_overrides

# Options for model overrides and loading
model:
  # Overrides the directory to look for models (default: models)
  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
  model_dir: models

  # An initial model to load. Make sure the model is located in the model directory!
  # A model can be loaded later via the API.
  # REQUIRED: This must be filled out to load a model on startup!
  # model_name:
  # model_name: TinyLlama-1B-exl2
  # model_name: Mixtral-8x7B-instruct-exl2
  # model_name: Mixtral-8x7B-Instruct-v0.1
  model_name: Trendyol-LLM-7b-chat-v0.1
  # model_name: miquliz-120b-v2.0-5.0bpw-h6-exl2
  # model_name: aya-101

  # Sends dummy model names when the models endpoint is queried
  # Enable this if the program is looking for a specific OAI model
  #use_dummy_models: False

  # The below parameters apply only if model_name is set

  # Max sequence length (default: Empty)
  # Fetched from the model's base sequence length in config.json by default
  #max_seq_len:

  # Overrides base model context length (default: Empty)
  # WARNING: Don't set this unless you know what you're doing!
  # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
  # override_base_seq_len: 30000 # TODO: What is this ??

  # Automatically allocate resources to GPUs (default: True)
  # gpu_split_auto: False

  # An integer array of GBs of VRAM to split between GPUs (default: [])
  # gpu_split: [50, 50]

  # Rope scale (default: 1.0)
  # Same thing as compress_pos_emb
  # Only use if your model was trained on long context with rope (check config.json)
  # Leave blank to pull the value from the model
  #rope_scale: 1.0

  # Rope alpha (default: 1.0)
  # Same thing as alpha_value
  # Leave blank to automatically calculate alpha
  #rope_alpha: 1.0

  # Disable Flash Attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
  #no_flash_attention: False

  # Enable 8-bit cache mode for VRAM savings (slight performance hit). Possible values: FP16, FP8. (default: FP16)
  #cache_mode: FP16

  # Set the prompt template for this model. If empty, chat completions will be disabled. (default: Empty)
  # NOTE: Only works with chat completion message lists!
  # prompt_template: chatml
  # prompt_template: zephyr # For TinyLlama
  prompt_template: mixtral
  # prompt_template: mistral-instruct
  # prompt_template: mistral-official # NOTE: Not working

  # Number of experts to use PER TOKEN. Fetched from the model's config.json if not specified (default: Empty)
  # WARNING: Don't set this unless you know what you're doing!
  # NOTE: For MoE models (ex. Mixtral) only!
  #num_experts_per_token:

  # Enables CFG support (default: False)
  # WARNING: This flag disables Flash Attention! (a stopgap fix until it's fixed in upstream)
  #use_cfg: False

  # Enables fasttensors to possibly increase model loading speeds (default: False)
  fasttensors: true

# Options for draft models (speculative decoding). This will use more VRAM!
#draft:
  # Overrides the directory to look for draft models (default: models)
  #draft_model_dir: models

  # An initial draft model to load. Make sure this model is located in the model directory!
  # A draft model can be loaded later via the API.
  #draft_model_name: A model name

  # Rope scale for draft models (default: 1.0)
  # Same thing as compress_pos_emb
  # Only use if your draft model was trained on long context with rope (check config.json)
  #draft_rope_scale: 1.0

  # Rope alpha for draft models (default: 1.0)
  # Same thing as alpha_value
  # Leave blank to automatically calculate the alpha value
  #draft_rope_alpha: 1.0

# Options for loras
#lora:
  # Overrides the directory to look for loras (default: loras)
  #lora_dir: loras

  # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.
  #loras:
  #- name: lora1
  #  scaling: 1.0
29 changes: 29 additions & 0 deletions tabbyAPI/prompt_templates/0_alpaca.jinja
@@ -0,0 +1,29 @@
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}

{% for message in messages %}
{% if message['role'] == 'user' %}
### Instruction:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% elif message['role'] == 'assistant' %}
### Response:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% elif message['role'] == 'user_context' %}
### Input:
{{ message['content']|trim -}}
{% if not loop.last %}


{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
### Response:
{% endif %}
2 changes: 2 additions & 0 deletions tabbyAPI/prompt_templates/0_chatml.jinja
@@ -0,0 +1,2 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
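
To check what a template actually produces before pointing prompt_template at it in config.yml, it can be rendered standalone with jinja2. A small sketch for the ChatML template above (message list and file path are illustrative; this particular template only needs messages and add_generation_prompt):

    # Render the ChatML template offline to inspect the exact prompt string.
    from jinja2 import Template

    with open("tabbyAPI/prompt_templates/0_chatml.jinja") as f:
        template = Template(f.read())

    prompt = template.render(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is ExLlamaV2?"},
        ],
        add_generation_prompt=True,
    )
    print(prompt)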
15 changes: 15 additions & 0 deletions tabbyAPI/prompt_templates/chatml-alternative.jinja
@@ -0,0 +1,15 @@
{% if messages[0]['role'] == 'system' %}
{% set offset = 1 %}
{% else %}
{% set offset = 0 %}
{% endif %}
{{ bos_token }}
{% for message in messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{{ '<|im_start|>' + message['role'] + '\n' + message['content'].strip() + '<|im_end|>\n' }}
{% endfor %}
{% if add_generation_prompt %}
{{ '<|im_start|>assistant\n' }}
{% endif %}
23 changes: 23 additions & 0 deletions tabbyAPI/prompt_templates/mistral-instruct.jinja
@@ -0,0 +1,23 @@
{% if messages[0]['role'] == 'system' %}
{% set loop_messages = messages[1:] %}
{% set system_message = messages[0]['content'].strip() + '\n\n' %}
{% else %}
{% set loop_messages = messages %}
{% set system_message = '' %}
{% endif %}
{{ bos_token }}
{% for message in loop_messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 %}
{% set content = system_message + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ '[INST] ' + content.strip() + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content.strip() + ' ' + eos_token }}
{% endif %}
{% endfor %}
1 change: 1 addition & 0 deletions tabbyAPI/prompt_templates/mixtral-official.jinja
@@ -0,0 +1 @@
{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
15 changes: 15 additions & 0 deletions tabbyAPI/prompt_templates/mixtral.jinja
@@ -0,0 +1,15 @@
{{- bos_token -}}
{% if bos_token|length > 0 %}
{{- ' ' -}}
{% endif %}
{% for message in messages %}
{% if message['role'] == 'system' %}
{{- message['content'] -}}
{% elif message['role'] == 'user' %}
{{- '[INST] ' + message['content'] + ' [/INST]' -}}
{% elif message['role'] == 'assistant' %}
{{- message['content'] + eos_token -}}
{% else %}
{{ raise_exception('Only user, assistant, and system roles are supported!') }}
{% endif %}
{% endfor %}
21 changes: 21 additions & 0 deletions tabbyAPI/prompt_templates/vicuna.jinja
@@ -0,0 +1,21 @@
{% if messages[0]['role'] == 'system' %}
{% set loop_messages = messages[1:] %}
{% set system_message = messages[0]['content'].strip() + '\n\n' %}
{% else %}
{% set loop_messages = messages %}
{% set system_message = '' %}
{% endif %}
{{ bos_token + system_message }}
{% for message in loop_messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if message['role'] == 'user' %}
{{ 'USER: ' + message['content'].strip() + '\n' }}
{% elif message['role'] == 'assistant' %}
{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}
{% endif %}
{% endfor %}
{% if add_generation_prompt %}
{{ 'ASSISTANT:' }}
{% endif %}