@@ -32,11 +32,13 @@
 
 try:
     from .fine_tune import FineTuner
-except ImportError as ex:
+except ImportError:
    if os.environ.get("GPUTOPIA_DEBUG_IMPORT"):
        log.exception("fine tuning not enabled")
    FineTuner = None
 
+from .fast_embed import FastEmbed, MODEL_PREFIX
+
 from gguf_loader.main import get_size
 
 from .gguf_reader import GGUFReader
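The `.fast_embed` module itself is not part of this diff. From its call sites here and below (`FastEmbed(self.conf)`, `self.fast_embed.embed(req.openai_req)`, and the `model.startswith(MODEL_PREFIX)` check), the interface it has to expose looks roughly like the sketch below; the prefix value and method body are assumptions, not the real implementation.

```python
# Interface implied by the call sites in this diff; bodies are hypothetical.
MODEL_PREFIX = "fast-embed/"  # assumed value; only the startswith() check is visible here


class FastEmbed:
    def __init__(self, conf):
        # conf is the worker Config object passed through from __init__ below
        self.conf = conf

    def embed(self, openai_req: dict) -> dict:
        # Takes an OpenAI-style /v1/embeddings request dict and returns an
        # OpenAI-style response dict; the caller json.dumps() it onto the
        # websocket in a single non-streaming send.
        raise NotImplementedError
```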
@@ -134,11 +136,14 @@ def __init__(self, conf: Config):
         self.llama = None
         self.llama_model = None
         self.llama_cli: Optional[AsyncClient] = None
+
         if FineTuner:
             self.fine_tuner = FineTuner(self.conf)
         else:
             self.fine_tuner = None
 
+        self.fast_embed = FastEmbed(self.conf)
+
     def _gen_or_load_priv(self) -> None:
         if not self.conf.privkey:
             cfg = self.conf.config
@@ -215,6 +220,10 @@ async def guess_layers(self, model_path):
 
         for gpu in info.nv_gpus:
             tot_mem += gpu.memory * 1000000
+
+        if tot_mem == 0:
+            for gpu in info.cl_gpus:
+                tot_mem += gpu.memory * 1000000
 
         if est_ram > tot_mem:
             est_layers = tot_mem // (est_ram / layers)
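Before the fallback, a box with only OpenCL devices left `tot_mem` at 0, so `est_layers` came out as 0 and no layers were offloaded. A worked example of the estimate with invented numbers (`est_ram` and `layers` come from the model metadata earlier in `guess_layers`, not shown in this hunk):

```python
# Invented numbers; gpu.memory is in MB, hence the * 1000000 above.
est_ram = 8_000_000_000    # estimated bytes to load the whole model
layers = 32                # layer count from the model metadata
tot_mem = 6000 * 1000000   # one 6000 MB OpenCL GPU, now counted by the fallback

if est_ram > tot_mem:
    # available bytes // bytes-per-layer -> how many layers fit on the GPU
    est_layers = tot_mem // (est_ram / layers)

print(est_layers)  # 24.0 -- instead of 0 before this change
```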
@@ -227,9 +236,12 @@ async def guess_layers(self, model_path):
         return max(0, est_layers - self.conf.layer_offset)
 
     async def load_model(self, name):
+        assert name, "No model name"
         if name == self.llama_model:
             return
+
         log.debug("loading model: %s", name)
+
         model_path = await self.get_model(name)
 
         if llama_cpp.server.app.llama:
@@ -243,6 +255,7 @@ async def load_model(self, name):
             embedding=True, cache=True, port=8181,
             main_gpu=self.conf.main_gpu, tensor_split=sp)
         self.llama = create_llama_app(settings)
+        assert self.llama, "Load llama failed. Try lowering layers."
         self.llama_cli = AsyncClient(app=self.llama, base_url="http://test")
         self.llama_model = name
 
@@ -256,6 +269,9 @@ def _get_connect_info(self) -> ConnectMessage:
         if self.fine_tuner:
             caps += ["llama-fine-tune"]
 
+        if self.fast_embed:
+            caps += ["fast-embed"]
+
         connect_msg = ConnectMessage(
             worker_version=VERSION,
             capabilities=caps,
@@ -366,6 +382,9 @@ async def run_one(self):
             async for event in self.fine_tuner.fine_tune(req.openai_req):
                 await self.ws_send(json.dumps(event), True)
             await self.ws_send("{}")
+        elif req.openai_url == "/v1/embeddings" and model.startswith(MODEL_PREFIX):
+            res = self.fast_embed.embed(req.openai_req)
+            await self.ws_send(json.dumps(res), True)
         elif req.openai_req.get("stream"):
             await self.load_model(model)
             async with aconnect_sse(self.llama_cli, "POST", req.openai_url, json=req.openai_req) as sse:
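For context, here is a request the new branch would pick up and the shape of the reply it sends back in a single non-streaming frame. The model name is hypothetical (the actual `MODEL_PREFIX` value lives in `.fast_embed`), and the response fields follow the standard OpenAI embeddings schema:

```python
# Hypothetical round trip through the new elif branch above.
request = {
    "model": "fast-embed/bge-small-en",  # must satisfy model.startswith(MODEL_PREFIX)
    "input": "hello world",
}
# self.fast_embed.embed(request) is expected to return an OpenAI-style
# embeddings response, which run_one() json.dumps()s onto the websocket:
response = {
    "object": "list",
    "data": [{"object": "embedding", "index": 0, "embedding": [0.013, -0.028]}],
    "model": "fast-embed/bge-small-en",
    "usage": {"prompt_tokens": 2, "total_tokens": 2},
}
```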