Skip to content
This repository was archived by the owner on Dec 14, 2024. It is now read-only.

Commit 66d58f6

Browse files
committed
Added:
- `llama.cpp` supports: `flash_attn`, `grp_attn_n`, `grp_attn_w`
1 parent 851d88c commit 66d58f6

File tree

5 files changed

+19
-107
lines changed

5 files changed

+19
-107
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# CHANGELOG
22

3+
## v0.1.55
4+
5+
Added:
6+
- `llama.cpp` supports: `flash_attn`, `grp_attn_n`, `grp_attn_w`
7+
38
## v0.1.54
49

510
Fixed:

README.md

-106
Original file line numberDiff line numberDiff line change
@@ -147,111 +147,6 @@ pip install torch torchvision torchaudio --index-url https://download.pytorch.or
147147
poetry install
148148
```
149149

150-
Download one of the popular models to try them:
151-
152-
```bash
153-
# NOTE: login in case you need to accept terms and conditions for some models
154-
# huggingface-cli login
155-
156-
# command-r
157-
huggingface-cli download mradermacher/c4ai-command-r-plus-i1-GGUF c4ai-command-r-plus.i1-IQ1_S.gguf
158-
huggingface-cli download nold/c4ai-command-r-v01-GGUF c4ai-command-r-v01_Q3_K_M.gguf
159-
huggingface-cli download nold/c4ai-command-r-v01-GGUF c4ai-command-r-v01_Q2_K.gguf
160-
161-
# xverse
162-
huggingface-cli download xverse/XVERSE-7B-Chat-GGUF xverse-7b-chat-q4_k_m.gguf
163-
huggingface-cli download xverse/XVERSE-13B-Chat-GGUF xverse-13b-chat-q4_k_m.gguf
164-
165-
# internlm2
166-
huggingface-cli download nold/internlm2-chat-20b-GGUF internlm2-chat-20b_Q3_K_M.gguf
167-
huggingface-cli download nold/internlm2-chat-20b-GGUF internlm2-chat-20b_Q4_K_M.gguf
168-
huggingface-cli download izumi04/InternLM2-Chat-7B-GGUF internlm2-chat-7b-Q3_K_M.gguf
169-
huggingface-cli download izumi04/InternLM2-Chat-7B-GGUF internlm2-chat-7b-Q4_K_M.gguf
170-
171-
# yi
172-
huggingface-cli download LoneStriker/Yi-9B-200K-GGUF Yi-9B-200K-Q4_K_M.gguf
173-
huggingface-cli download LoneStriker/Yi-6B-200K-GGUF Yi-6B-200K-Q4_K_M.gguf
174-
175-
# gemma
176-
huggingface-cli download pabloce/dolphin-2.8-gemma-2b-GGUF dolphin-2.8-gemma-2b.Q4_K_M.gguf
177-
huggingface-cli download bartowski/gemma-1.1-7b-it-GGUF gemma-1.1-7b-it-Q4_K_M.gguf
178-
huggingface-cli download bartowski/gemma-1.1-2b-it-GGUF gemma-1.1-2b-it-Q4_K_M.gguf
179-
180-
# qwen
181-
huggingface-cli download qwp4w3hyb/Qwen1.5-14B-Chat-iMat-GGUF qwen1.5-14b-chat-imat-IQ1_S.gguf
182-
huggingface-cli download qwp4w3hyb/Qwen1.5-14B-Chat-iMat-GGUF qwen1.5-14b-chat-imat-IQ2_XS.gguf
183-
huggingface-cli download qwp4w3hyb/Qwen1.5-14B-Chat-iMat-GGUF qwen1.5-14b-chat-imat-IQ2_S.gguf
184-
huggingface-cli download qwp4w3hyb/Qwen1.5-14B-Chat-iMat-GGUF qwen1.5-14b-chat-imat-IQ2_M.gguf
185-
huggingface-cli download qwp4w3hyb/Qwen1.5-14B-Chat-iMat-GGUF qwen1.5-14b-chat-imat-IQ3_M.gguf
186-
huggingface-cli download Qwen/Qwen1.5-14B-Chat-GGUF qwen1_5-14b-chat-q2_k.gguf
187-
huggingface-cli download Qwen/Qwen1.5-14B-Chat-GGUF qwen1_5-14b-chat-q3_k_m.gguf
188-
huggingface-cli download Qwen/Qwen1.5-14B-Chat-GGUF qwen1_5-14b-chat-q4_k_m.gguf
189-
huggingface-cli download Qwen/Qwen1.5-7B-Chat-GGUF qwen1_5-7b-chat-q4_k_m.gguf
190-
huggingface-cli download Qwen/Qwen1.5-4B-Chat-GGUF qwen1_5-4b-chat-q4_k_m.gguf
191-
huggingface-cli download Qwen/Qwen1.5-1.8B-Chat-GGUF qwen1_5-1_8b-chat-q4_k_m.gguf
192-
huggingface-cli download Qwen/Qwen1.5-0.5B-Chat-GGUF qwen1_5-0_5b-chat-q4_k_m.gguf
193-
194-
# mistral ai
195-
huggingface-cli download bartowski/Mistral-22B-v0.2-GGUF Mistral-22B-v0.2-IQ2_M.gguf
196-
huggingface-cli download bartowski/Mistral-22B-v0.2-GGUF Mistral-22B-v0.2-Q4_K_M.gguf
197-
huggingface-cli download TheBloke/dolphin-2.7-mixtral-8x7b-GGUF dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf
198-
huggingface-cli download mradermacher/Mixtral-8x7B-Instruct-v0.1-i1-GGUF Mixtral-8x7B-Instruct-v0.1.i1-IQ1_S.gguf
199-
huggingface-cli download mradermacher/Mixtral-8x7B-Instruct-v0.1-i1-GGUF Mixtral-8x7B-Instruct-v0.1.i1-IQ2_XXS.gguf
200-
huggingface-cli download mradermacher/Mixtral-8x7B-Instruct-v0.1-i1-GGUF Mixtral-8x7B-Instruct-v0.1.i1-IQ2_M.gguf
201-
huggingface-cli download mradermacher/Mixtral-8x7B-Instruct-v0.1-i1-GGUF Mixtral-8x7B-Instruct-v0.1.i1-Q3_K_M.gguf
202-
huggingface-cli download bartowski/dolphin-2.8-mistral-7b-v02-GGUF dolphin-2.8-mistral-7b-v02-Q4_K_M.gguf
203-
huggingface-cli download TheBloke/dolphin-2.6-mistral-7B-GGUF dolphin-2.6-mistral-7b.Q4_K_M.gguf
204-
huggingface-cli download NousResearch/Hermes-2-Pro-Mistral-7B-GGUF Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
205-
huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf
206-
207-
# stability ai
208-
huggingface-cli download lmz/candle-stablelm
209-
huggingface-cli download stabilityai/stablelm-2-12b-chat-GGUF stablelm-2-12b-chat-Q4_K_M.gguf
210-
huggingface-cli download brittlewis12/stablelm-2-1_6b-chat-GGUF stablelm-2-1_6b-chat.Q8_0.gguf
211-
huggingface-cli download stabilityai/stablelm-2-zephyr-1_6b stablelm-2-zephyr-1_6b-Q4_1.gguf
212-
huggingface-cli download stabilityai/stablelm-2-zephyr-1_6b stablelm-2-zephyr-1_6b-Q8_0.gguf
213-
huggingface-cli download TheBloke/stablelm-zephyr-3b-GGUF stablelm-zephyr-3b.Q4_K_M.gguf
214-
huggingface-cli download TheBloke/stable-code-3b-GGUF stable-code-3b.Q4_K_M.gguf
215-
216-
# technology innovation institute (tii)
217-
huggingface-cli download mradermacher/falcon-40b-instruct-GGUF falcon-40b-instruct.IQ3_XS.gguf
218-
huggingface-cli download maddes8cht/tiiuae-falcon-7b-instruct-gguf tiiuae-falcon-7b-instruct-Q4_K_M.gguf
219-
220-
# meta llama
221-
huggingface-cli download NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
222-
huggingface-cli download bartowski/dolphin-2.9-llama3-8b-1m-GGUF dolphin-2.9-llama3-8b-1m-IQ1_S.gguf
223-
huggingface-cli download PrunaAI/dolphin-2.9-llama3-8b-1m-GGUF-smashed dolphin-2.9-llama3-8b-1m.Q4_K_M.gguf
224-
huggingface-cli download PrunaAI/dolphin-2.9-llama3-8b-256k-GGUF-smashed dolphin-2.9-llama3-8b-256k.IQ3_XS.gguf
225-
huggingface-cli download PrunaAI/dolphin-2.9-llama3-8b-256k-GGUF-smashed dolphin-2.9-llama3-8b-256k.Q4_K_M.gguf
226-
huggingface-cli download mradermacher/Meta-Llama-3-8B-Instruct-64k-GGUF Meta-Llama-3-8B-Instruct-64k.Q4_K_M.gguf
227-
huggingface-cli download cognitivecomputations/dolphin-2.9-llama3-8b-gguf dolphin-2.9-llama3-8b-q4_K_M.gguf
228-
huggingface-cli download cognitivecomputations/dolphin-2.9-llama3-8b-gguf dolphin-2.9-llama3-8b-q8_0.gguf
229-
huggingface-cli download mradermacher/Meta-Llama-3-8B-Instruct-i1-GGUF Meta-Llama-3-8B-Instruct.i1-Q4_K_M.gguf
230-
huggingface-cli download mradermacher/Meta-Llama-3-8B-Instruct-i1-GGUF Meta-Llama-3-8B-Instruct.i1-IQ4_XS.gguf
231-
huggingface-cli download TheBloke/Orca-2-7B-GGUF orca-2-7b.Q4_K_M.gguf
232-
huggingface-cli download afrideva/MiniChat-2-3B-GGUF minichat-2-3b.q4_k_m.gguf
233-
huggingface-cli download azarovalex/MobileLLaMA-1.4B-Chat-GGUF MobileLLaMA-1.4B-Chat-Q4_K.gguf
234-
huggingface-cli download TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
235-
huggingface-cli download TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF tinyllama-1.1b-chat-v1.0.Q8_0.gguf
236-
huggingface-cli download s3nh/TinyDolphin-2.8-1.1b-GGUF tinydolphin-2.8-1.1b.Q4_K_M.gguf
237-
huggingface-cli download s3nh/TinyDolphin-2.8-1.1b-GGUF tinydolphin-2.8-1.1b.Q8_0.gguf
238-
huggingface-cli download thephimart/tinyllama-4x1.1b-moe.Q5_K_M.gguf tinyllama-4x1.1b-moe.Q5_K_M.gguf
239-
240-
# microsoft phi
241-
huggingface-cli download lmz/candle-quantized-phi
242-
huggingface-cli download PrunaAI/Phi-3-mini-128k-instruct-GGUF-Imatrix-smashed Phi-3-mini-128k-instruct.IQ2_XXS.gguf
243-
huggingface-cli download PrunaAI/Phi-3-mini-128k-instruct-GGUF-Imatrix-smashed Phi-3-mini-128k-instruct.Q4_K_M.gguf
244-
huggingface-cli download PrunaAI/Phi-3-mini-128k-instruct-GGUF-Imatrix-smashed Phi-3-mini-128k-instruct.Q5_K_M.gguf
245-
huggingface-cli download QuantFactory/Phi-3-mini-128k-instruct-GGUF Phi-3-mini-128k-instruct.Q4_K_M.gguf
246-
huggingface-cli download QuantFactory/Phi-3-mini-128k-instruct-GGUF Phi-3-mini-128k-instruct.Q8_0.gguf
247-
huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-fp16.gguf
248-
huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-q4.gguf
249-
huggingface-cli download TheBloke/dolphin-2_6-phi-2-GGUF dolphin-2_6-phi-2.Q4_K_M.gguf
250-
huggingface-cli download MaziyarPanahi/phi-2-super-GGUF phi-2-super.Q4_K_M.gguf
251-
huggingface-cli download TheBloke/phi-2-GGUF phi-2.Q4_K_M.gguf
252-
huggingface-cli download TKDKid1000/phi-1_5-GGUF phi-1_5-Q4_K_M.gguf
253-
```
254-
255150
Run server:
256151

257152
```bash
@@ -286,7 +181,6 @@ openssl req -x509 -nodes -newkey rsa:4096 -keyout key.pem -out cert.pem -days 36
286181
```
287182

288183

289-
290184
## Run
291185

292186
```bash

mli/params.py

+3
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class LlamaCppParams(TypedDict):
3434
file: Optional[str] # / prompt file (path) to start generation
3535
image: Optional[str] # path to an image file. use with multimodal models
3636
no_display_prompt: Optional[bool] # True
37+
grp_attn_n: Optional[int] # group-attention factor (default: 1)
38+
grp_attn_w: Optional[float] # group-attention width (default: 512.0)
3739
split_mode: Optional[str] # 'none', 'layer' (default), 'row'
3840
tensor_split: Optional[str] # None, e.g. '3,1'
3941
main_gpu: Optional[int] # None, e.g. 0 (default)
@@ -48,6 +50,7 @@ class LlamaCppParams(TypedDict):
4850
rope_freq_base: Optional[int | float] # RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
4951
rope_freq_scale: Optional[int | float] # RoPE frequency scaling factor, expands context by a factor of 1/N
5052
cont_batching: Optional[bool] # enable continuous batching (a.k.a dynamic batching) (default: disabled)
53+
flash_attn: Optional[bool] # enable Flash Attention (default: disabled)
5154
prompt_to_file: Optional[bool] # save prompt to file
5255
image_to_file: Optional[bool] # base64 encoded image to be saved to file
5356

mli/server.py

+10
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ def _format_llama_cpp_cmd(self, kwargs: LlamaCppParams) -> str:
106106
repeat_penalty: float = float(kwargs.get('repeat_penalty', 1.0))
107107
penalize_nl: bool | None = kwargs.get('penalize_nl')
108108
no_display_prompt: float = float(kwargs.get('no_display_prompt', True))
109+
grp_attn_n: int = int(kwargs.get('grp_attn_n', 1))
110+
grp_attn_w: float = float(kwargs.get('grp_attn_w', 512.0))
109111
split_mode: str | None = kwargs.get('split_mode')
110112
tensor_split: str | None = kwargs.get('tensor_split')
111113
main_gpu: int | None = kwargs.get('main_gpu')
@@ -120,6 +122,7 @@ def _format_llama_cpp_cmd(self, kwargs: LlamaCppParams) -> str:
120122
rope_freq_base: int | float | None = kwargs.get('rope_freq_base')
121123
rope_freq_scale: int | float | None = kwargs.get('rope_freq_scale')
122124
cont_batching: bool | None = kwargs.get('cont_batching', False)
125+
flash_attn: bool | None = kwargs.get('flash_attn', False)
123126
prompt_to_file: bool = kwargs.get('prompt_to_file', False)
124127
image_to_file: bool = kwargs.get('image_to_file', False)
125128

@@ -228,6 +231,11 @@ def _format_llama_cpp_cmd(self, kwargs: LlamaCppParams) -> str:
228231
'--cont-batching',
229232
])
230233

234+
if flash_attn is not None:
235+
cmd.extend([
236+
'--flash-attn',
237+
])
238+
231239
if prompt and not prompt_to_file:
232240
shell_prompt: str = shlex.quote(prompt)
233241

@@ -295,6 +303,8 @@ def _format_llama_cpp_cmd(self, kwargs: LlamaCppParams) -> str:
295303
'--repeat-last-n', repeat_last_n,
296304
'--repeat-penalty', repeat_penalty,
297305
'--keep', keep,
306+
'--grp-attn-n', grp_attn_n,
307+
'--grp-attn-w', grp_attn_w,
298308
'--simple-io',
299309
'--log-disable',
300310
])

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "mlipy"
3-
version = "0.1.54"
3+
version = "0.1.55"
44
description = "Python-based Machine Learning Interface"
55
homepage = "https://github.com/tangledgroup/mlipy"
66
repository = "https://github.com/tangledgroup/mlipy"

0 commit comments

Comments (0)