Commit 087034e

support for embedding api (OpenAgentsInc#22)
1 parent 31201aa commit 087034e

24 files changed: +777 -346 lines changed

.github/workflows/main.yml

+50
@@ -0,0 +1,50 @@
+name: Python package
+
+on: [push]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  BILLING_URL: ${{ vars.BILLING_URL }}
+  SECRET_KEY: ${{ vars.SECRET_KEY }}
+  BYPASS_TOKEN: ${{ vars.BYPASS_TOKEN }}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    env:
+      COVERAGE_FILE: ".coverage.${{ matrix.info.os }}.${{ matrix.info.python }}.${{ matrix.info.mysql }}"
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install poetry
+        run: pipx install poetry
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'poetry'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff pytest poetry coverage
+          poetry lock --check
+          poetry install --with=dev-onnx
+      - name: Lint with ruff
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          ruff --output-format=github --target-version=py311 .
+      - name: Test with pytest
+        run: |
+          ./run_tests.sh
+      - uses: actions/upload-artifact@v3
+        with:
+          name: coverage
+          path: |
+            ${{ env.COVERAGE_FILE }}
+            htmlcov

.gitignore

+3-2
@@ -2,10 +2,11 @@ ai_worker/version.py
 
 # lib stuff
 CLBlast/
-build-cuda/
-build-opencl/
+build-*/
 bin/
 *.gguf
+local_cache/
+tests/local_cache/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

ai_worker/fast_embed.py

+70
@@ -0,0 +1,70 @@
+import json
+import os
+import logging as log
+from typing import Optional
+
+DEFAULT_MAX_LENGTH = 512
+DEFAULT_MODEL = "BAAI/bge-base-en-v1.5"
+MODEL_PREFIX = "fastembed:"
+
+
+class _FastEmbed:
+    def __init__(self, cls, conf):
+        self.conf = conf
+        self.embedding_class = cls
+        self.embedding_model = cls(model_name=DEFAULT_MODEL, max_length=DEFAULT_MAX_LENGTH)
+
+    def embed(self, req: dict):
+        model = req["model"]
+
+        if model.startswith(MODEL_PREFIX):
+            model = model[len(MODEL_PREFIX):]
+
+        max_length = req.get("max_length", 512)
+
+        if self.embedding_model.model_name != model or self.embedding_model._max_length != max_length:  # noqa
+            # swap out model
+            self.embedding_model = self.embedding_class(model_name=model, max_length=max_length)
+
+        docs = req["input"]
+        if isinstance(docs, str):
+            docs = [docs]
+
+        # todo: better toks count
+        toks = int(len(json.dumps(docs)) / 2.5)
+
+        res = {
+            "object": "list",
+            "model": model,
+            "usage": {
+                "prompt_tokens": toks,
+                "total_tokens": toks
+            }
+        }
+
+        embed = [
+            dict(
+                object="embedding",
+                embedding=nda.tolist(),
+                index=i
+            )
+            for i, nda in enumerate(self.embedding_model.embed(docs, parallel=0))
+        ]
+
+        res["data"] = embed
+
+        return res
+
+
+def FastEmbed(*a) -> Optional[_FastEmbed]:
+    try:
+        from fastembed.embedding import FlagEmbedding as Embedding
+        import onnxruntime as ort
+        if ort.get_device() != "GPU" and not os.environ.get("CI"):
+            log.warning("fast embed not enabled, ort runtime does not see the GPU")
+            return None
+        return _FastEmbed(Embedding, *a)
+    except ImportError:
+        if os.environ.get("GPUTOPIA_DEBUG_IMPORT"):
+            log.exception("fast embed not enabled")
+        return None
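
For context, a minimal usage sketch of the new module (not part of the commit). It assumes fastembed and a GPU-visible onnxruntime are installed; the empty dict stands in for the worker's real Config object, which this module only stores:

    from ai_worker.fast_embed import FastEmbed, MODEL_PREFIX

    fe = FastEmbed({})  # returns None if fastembed/onnxruntime or the GPU are unavailable
    if fe:
        req = {
            "model": MODEL_PREFIX + "BAAI/bge-base-en-v1.5",  # prefix is stripped before loading
            "input": ["hello world", "embed me too"],
        }
        res = fe.embed(req)
        # OpenAI-style response: object/model/usage plus a "data" list of embedding dicts
        print(res["usage"], len(res["data"][0]["embedding"]))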

ai_worker/fine_tune.py

+1-11
@@ -1,12 +1,10 @@
 import gc
-import hashlib
 import json
 import asyncio
 import threading
 import logging
 import os
 import random
-import tarfile
 import shutil
 
 import transformers
@@ -17,7 +15,7 @@
 from peft import prepare_model_for_kbit_training, PeftModel, LoraConfig, get_peft_model
 from accelerate import FullyShardedDataParallelPlugin, Accelerator
 from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
-from ai_worker.util import quantize_gguf, url_to_tempfile, user_ft_name_to_url
+from ai_worker.util import quantize_gguf, url_to_tempfile, user_ft_name_to_url, gzip
 
 from ai_worker.util import b64enc
 from gguf_loader.convert import main as gguf_main
@@ -27,14 +25,6 @@
 log = logging.getLogger(__name__)
 
 
-def gzip(folder):
-    """tar gz the folder to 'folder.tar.gz', removes the folder"""
-    base_folder_name = os.path.basename(folder)
-    with tarfile.open(f"{folder}.tar.gz", 'w:gz') as archive:
-        archive.add(folder, arcname=base_folder_name)
-    return f"{folder}.tar.gz"
-
-
 class FineTuner:
     def __init__(self, conf):
         self.conf = conf

ai_worker/gguf_reader.py

+2-2
@@ -131,8 +131,8 @@ def read_header(self):
 
         self.gguf_version = struct.unpack("<I", self.fin.read(4))[0]
 
-        if self.gguf_version != 2:
-            raise ValueError("Can only summarize version 2 files")
+        if self.gguf_version < 2:
+            raise ValueError("Can only summarize version 2/3 files, got version %s" % self.gguf_version)
 
         self.ti_data_count = struct.unpack("<Q", self.fin.read(8))[0]
         self.kv_data_count = struct.unpack("<Q", self.fin.read(8))[0]

ai_worker/jsonlines.py

-10
This file was deleted.

ai_worker/main.py

+20-1
@@ -32,11 +32,13 @@
 
 try:
     from .fine_tune import FineTuner
-except ImportError as ex:
+except ImportError:
     if os.environ.get("GPUTOPIA_DEBUG_IMPORT"):
         log.exception("fine tuning not enabled")
     FineTuner = None
 
+from .fast_embed import FastEmbed, MODEL_PREFIX
+
 from gguf_loader.main import get_size
 
 from .gguf_reader import GGUFReader
@@ -134,11 +136,14 @@ def __init__(self, conf: Config):
         self.llama = None
         self.llama_model = None
         self.llama_cli: Optional[AsyncClient] = None
+
         if FineTuner:
             self.fine_tuner = FineTuner(self.conf)
         else:
             self.fine_tuner = None
 
+        self.fast_embed = FastEmbed(self.conf)
+
     def _gen_or_load_priv(self) -> None:
         if not self.conf.privkey:
             cfg = self.conf.config
@@ -215,6 +220,10 @@ async def guess_layers(self, model_path):
 
         for gpu in info.nv_gpus:
             tot_mem += gpu.memory * 1000000
+
+        if tot_mem == 0:
+            for gpu in info.cl_gpus:
+                tot_mem += gpu.memory * 1000000
 
         if est_ram > tot_mem:
             est_layers = tot_mem // (est_ram / layers)
@@ -227,9 +236,12 @@
         return max(0, est_layers - self.conf.layer_offset)
 
     async def load_model(self, name):
+        assert name, "No model name"
        if name == self.llama_model:
             return
+
         log.debug("loading model: %s", name)
+
         model_path = await self.get_model(name)
 
         if llama_cpp.server.app.llama:
@@ -243,6 +255,7 @@ async def load_model(self, name):
                             embedding=True, cache=True, port=8181,
                             main_gpu=self.conf.main_gpu, tensor_split=sp)
         self.llama = create_llama_app(settings)
+        assert self.llama, "Load llama failed. Try lowering layers."
         self.llama_cli = AsyncClient(app=self.llama, base_url="http://test")
         self.llama_model = name
 
@@ -256,6 +269,9 @@ def _get_connect_info(self) -> ConnectMessage:
         if self.fine_tuner:
             caps += ["llama-fine-tune"]
 
+        if self.fast_embed:
+            caps += ["fast-embed"]
+
         connect_msg = ConnectMessage(
             worker_version=VERSION,
             capabilities=caps,
@@ -366,6 +382,9 @@ async def run_one(self):
                 async for event in self.fine_tuner.fine_tune(req.openai_req):
                     await self.ws_send(json.dumps(event), True)
                 await self.ws_send("{}")
+            elif req.openai_url == "/v1/embeddings" and model.startswith(MODEL_PREFIX):
+                res = self.fast_embed.embed(req.openai_req)
+                await self.ws_send(json.dumps(res), True)
             elif req.openai_req.get("stream"):
                 await self.load_model(model)
                 async with aconnect_sse(self.llama_cli, "POST", req.openai_url, json=req.openai_req) as sse:
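
For orientation (illustrative, not part of the commit): the new run_one() branch above answers embedding jobs locally when the requested model name carries the fastembed: prefix, while anything else still flows to the llama.cpp paths below it. A job payload of roughly this shape would take the new branch; field names mirror the ones the code reads, values are made up:

    job = {
        "openai_url": "/v1/embeddings",
        "openai_req": {
            "model": "fastembed:BAAI/bge-base-en-v1.5",  # MODEL_PREFIX routes this to fast_embed
            "input": ["first passage", "second passage"],
        },
    }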

ai_worker/quantize_main.py

-2
@@ -2,8 +2,6 @@
 
 from ai_worker.util import quantize_gguf, GGML_INVERSE_MAP
 
-import argparse
-
 # List of quantization levels
 quantization_levels = list(GGML_INVERSE_MAP.keys())

ai_worker/util.py

+9
@@ -1,6 +1,7 @@
 import base64
 import hashlib
 import os
+import tarfile
 
 import llama_cpp
 
@@ -61,3 +62,11 @@ def url_to_tempfile(conf, url):
     name = hashlib.md5(url.encode()).hexdigest()
     output_file = os.path.join(conf.tmp_dir, name)
     return output_file
+
+
+def gzip(folder):
+    """tar gz the folder to 'folder.tar.gz', removes the folder"""
+    base_folder_name = os.path.basename(folder)
+    with tarfile.open(f"{folder}.tar.gz", 'w:gz') as archive:
+        archive.add(folder, arcname=base_folder_name)
+    return f"{folder}.tar.gz"
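
Quick illustration (not from the commit) of the relocated helper: given a directory path, it writes a sibling .tar.gz containing the directory under its base name and returns the archive path:

    from ai_worker.util import gzip

    archive_path = gzip("/tmp/finetune_output")  # hypothetical folder
    # archive_path == "/tmp/finetune_output.tar.gz"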

ai_worker/version.py

+1-1
@@ -1 +1 @@
-VERSION = '0.2.3'
+VERSION = '0.3.0'

build-bin.sh

+4-7
@@ -16,6 +16,10 @@ if [ "$gpu" == "cuda-torch" ]; then
   opts=""
 fi
 
+with_onnx=""
+if [ "$gpu" == "cuda" ]; then
+    with_torch="--with onnx"
+fi
 
 set -o xtrace
 
@@ -36,11 +40,4 @@ python build-version.py
 
 ./pyinstaller.sh $gpu-$arch $opts
 
-if [ "$gpu" == "cuda-torch" ]; then
-    pushd dist
-    tar cvf - gputopia-worker-$gpu-$arch/ | pigz -9 - > gputopia-worker-$gpu-$arch.tar.gz
-    rm -rf gputopia-worker-$gpu-$arch/
-    popd
-fi
-
 deactivate

build-mac.sh

File mode changed (100644 → 100755).

build-windows.sh

File mode changed (100644 → 100755).

gguf_loader/convert.py

+2-6
@@ -3,11 +3,9 @@
 
 import argparse
 import concurrent.futures
-import copy
 import enum
 import faulthandler
 import functools
-import io
 import itertools
 import json
 import math
@@ -23,12 +21,11 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]
 
-import os
 from . import gguf
 
 if TYPE_CHECKING:
@@ -335,7 +332,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
 
         for i, _ in enumerate(tokenizer):
@@ -850,7 +846,7 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
         elif isinstance(vocab, BpeVocab):
             self.gguf.add_tokenizer_model("gpt2")
         else:
-            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)

gguf_loader/convert_llama_ggml_to_gguf.py

+1-2
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import argparse
-import math
 import struct
 import sys
 from enum import IntEnum
@@ -425,7 +424,7 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    model.load(data, 0)
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
