From 52e0ef495dacd28cd910d4dc54715aa3c1acc242 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 14:28:45 +0800
Subject: [PATCH 01/15] test

---
 main.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/main.py b/main.py
index d5d9b0e..5a9ed56 100644
--- a/main.py
+++ b/main.py
@@ -7,8 +7,6 @@ import uuid
 import asyncio
 import os
 
-from dotenv import load_dotenv
-load_dotenv()
 
 app = FastAPI()
 
@@ -164,4 +162,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
\ No newline at end of file
+    uvicorn.run(app, host="0.0.0.0", port=8090)

From 0ccbe328698a0951dc59caad28b26ae66c7b3bf6 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 14:31:44 +0800
Subject: [PATCH 02/15] Update main.py

---
 main.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index 5a9ed56..6694d1c 100644
--- a/main.py
+++ b/main.py
@@ -18,37 +18,38 @@
     allow_headers=["*"],
 )
 
+time_to_sleep = 0
 
-def data_generator():
+async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "Hello this is a test response from a fixed OpenAI endpoint."
     words = sentence.split(" ")
     for word in words:
         word = word + " "
         chunk = {
-                "id": f"chatcmpl-{response_id}",
-                "object": "chat.completion.chunk",
-                "created": 1677652288,
-                "model": "gpt-3.5-turbo-0125",
-                "choices": [{"index": 0, "delta": {"content": word}}],
-            }
+            "id": f"chatcmpl-{response_id}",
+            "object": "chat.completion.chunk",
+            "created": 1677652288,
+            "model": "gpt-3.5-turbo-0125",
+            "choices": [{"index": 0, "delta": {"content": word}}],
+        }
         try:
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-
+        await asyncio.sleep(1)
 
 # for completion
 @app.post("/chat/completions")
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
-    _time_to_sleep = os.getenv("TIME_TO_SLEEP", None)
-    if _time_to_sleep is not None:
-        print("sleeping for " + _time_to_sleep)
-        await asyncio.sleep(float(_time_to_sleep))
+    if time_to_sleep:
+        print("sleeping for " + time_to_sleep)
+        await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
+    print(data)
 
     if data.get("stream") == True:
         return StreamingResponse(

From bbe8ac914f6c8873b27eae5a8d5050b60a456ffd Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 14:43:18 +0800
Subject: [PATCH 03/15] fix

---
 main.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 6694d1c..9babca7 100644
--- a/main.py
+++ b/main.py
@@ -18,7 +18,8 @@
     allow_headers=["*"],
 )
 
-time_to_sleep = 0
+time_to_sleep = 1
+time_to_sleep_steam = 0.3
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -37,7 +38,8 @@ async def data_generator():
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-        await asyncio.sleep(1)
+        if time_to_sleep_steam:
+            await asyncio.sleep(time_to_sleep_stream)
 
 # for completion
 @app.post("/chat/completions")
@@ -45,7 +47,7 @@ async def data_generator():
 async def completion(request: Request):
     if time_to_sleep:
-        print("sleeping for " + time_to_sleep)
+        print(f"sleeping for {time_to_sleep}")
         await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
@@ -163,4 +165,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
+    uvicorn.run(app, host="0.0.0.0", port=8000)

From b6928f333b2d7e2d1871af6dcbc6bd20c942e148 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 17:40:07 +0800
Subject: [PATCH 04/15] fix

---
 main.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 9babca7..88b0e11 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@ import uuid
 import asyncio
 import os
-
+import time
 
 app = FastAPI()
 
 app.add_middleware(
@@ -19,7 +19,7 @@
 )
 
 time_to_sleep = 1
-time_to_sleep_steam = 0.3
+time_to_sleep_stream = 0.3
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -38,7 +38,7 @@ async def data_generator():
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-        if time_to_sleep_steam:
+        if time_to_sleep_stream:
             await asyncio.sleep(time_to_sleep_stream)
 
 # for completion
@@ -47,11 +47,12 @@ async def data_generator():
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
     if time_to_sleep:
-        print(f"sleeping for {time_to_sleep}")
+        # print(f"sleeping for {time_to_sleep}")
         await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
-    print(data)
+    #print(data)
+    print(time.time())
 
     if data.get("stream") == True:
         return StreamingResponse(

From d60eb62eec08fa58978e3d7a80138ad8d5208ade Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:43:02 +0800
Subject: [PATCH 05/15] Update main.py

---
 main.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/main.py b/main.py
index 88b0e11..2060f24 100644
--- a/main.py
+++ b/main.py
@@ -2,12 +2,14 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from loguru import logger
 import asyncio
 import json
 import uuid
 import asyncio
 import os
 import time
+
 
 app = FastAPI()
 
 app.add_middleware(
@@ -21,6 +23,19 @@
 time_to_sleep = 1
 time_to_sleep_stream = 0.3
 
+logger.add('error.log', level=40)
+
+@app.exception_handler(Exception)
+async def custom_exception_handler(request: Request, exc: Exception):
+    # Log the exception details
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # Return a generic error response
+    return JSONResponse(
+        status_code=500,
+        content={"message": "An unexpected error occurred"},
+    )
+
+
 async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "Hello this is a test response from a fixed OpenAI endpoint."
From 9f12466bd220c0ad6ebbadf380051caa582a649a Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:58:20 +0800
Subject: [PATCH 06/15] Update main.py

---
 main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.py b/main.py
index 2060f24..d2207fa 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.responses import JSONResponse
 from loguru import logger
 import asyncio
 import json

From 10445167c95839ae1334edf8d4f7dc6dda1066d7 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 19:45:12 +0800
Subject: [PATCH 07/15] Update main.py

---
 main.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index d2207fa..fd0f21e 100644
--- a/main.py
+++ b/main.py
@@ -10,6 +10,11 @@ import asyncio
 import os
 import time
+from langchain.text_splitter import TokenTextSplitter
+
+
+
+
 
 app = FastAPI()
 
@@ -39,10 +44,14 @@ async def custom_exception_handler(request: Request, exc: Exception):
 
 async def data_generator():
     response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint."
+    sentence = "Hello this is a test response from a fixed OpenAI endpoint. " * 5
     words = sentence.split(" ")
+
+    a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    words = a.split_text(sentence)
+
     for word in words:
-        word = word + " "
+        word = word
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",

From 9c938e137b1f33bd4dff464fe9b05ecd31ec9c3c Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 09:59:20 +0800
Subject: [PATCH 08/15] Update main.py

---
 main.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index fd0f21e..bf1e968 100644
--- a/main.py
+++ b/main.py
@@ -10,6 +10,7 @@ import asyncio
 import os
 import time
+import tiktoken
 from langchain.text_splitter import TokenTextSplitter
 
 
@@ -44,14 +45,15 @@ async def custom_exception_handler(request: Request, exc: Exception):
 
 async def data_generator():
     response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint. " * 5
-    words = sentence.split(" ")
+    sentence = "花香蕉的钱,只能请到猴子. " * 5
 
-    a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
-    words = a.split_text(sentence)
+    # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    # words = a.split_text(sentence)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    token_integers = encoding.encode(sentence)
+    words = [encoding.decode_single_token_bytes(token) for token in token_integers]
 
     for word in words:
-        word = word
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",

From 5a7ca33b0a83d0ef1952dac8cc3487fac285246e Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 10:08:57 +0800
Subject: [PATCH 09/15] Update main.py

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index bf1e968..5aa6571 100644
--- a/main.py
+++ b/main.py
@@ -62,7 +62,7 @@ async def data_generator():
             "choices": [{"index": 0, "delta": {"content": word}}],
         }
         try:
-            yield f"data: {json.dumps(chunk.dict())}\n\n"
+            yield f"data: {json.dumps(chunk)}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
         if time_to_sleep_stream:

From 827bef74283d5d6338396c511af4a7e74e649a16 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:08:44 +0800
Subject: [PATCH 10/15] Update main.py

---
 main.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 5aa6571..e5fa745 100644
--- a/main.py
+++ b/main.py
@@ -42,6 +42,22 @@ async def custom_exception_handler(request: Request, exc: Exception):
         content={"message": "An unexpected error occurred"},
     )
 
+def fix_incomplete_utf8(words):
+    combined = bytearray()
+    fixed_words = []
+    for word in words:
+        try:
+            combined.extend(word)
+            # Try decoding to check whether we have a complete UTF-8 character
+            combined.decode('utf-8')
+            fixed_words.append(bytes(combined))
+            combined.clear()
+        except UnicodeDecodeError:
+            continue # On a decode error, keep appending bytes until they decode
+    if combined:
+        fixed_words.append(bytes(combined)) # Append the trailing byte sequence
+    return fixed_words
+
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -52,19 +68,16 @@ async def data_generator():
     encoding = tiktoken.get_encoding("cl100k_base")
     token_integers = encoding.encode(sentence)
     words = [encoding.decode_single_token_bytes(token) for token in token_integers]
-
-    for word in words:
+    fixed_words = fix_incomplete_utf8(words)
+    for word in fixed_words:
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",
             "created": 1677652288,
             "model": "gpt-3.5-turbo-0125",
-            "choices": [{"index": 0, "delta": {"content": word}}],
+            "choices": [{"index": 0, "delta": {"content": word.decode('utf-8')}}],
         }
-        try:
-            yield f"data: {json.dumps(chunk)}\n\n"
-        except:
-            yield f"data: {json.dumps(chunk)}\n\n"
+        yield f"data: {json.dumps(chunk)}\n\n"
         if time_to_sleep_stream:
             await asyncio.sleep(time_to_sleep_stream)

From a1acafcae0eab27d4f6a93420ec11de45472503a Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:12:38 +0800
Subject: [PATCH 11/15] Update main.py

---
 main.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/main.py b/main.py
index e5fa745..9073089 100644
--- a/main.py
+++ b/main.py
@@ -11,11 +11,7 @@ import os
 import time
 import tiktoken
-from langchain.text_splitter import TokenTextSplitter
-
-
-
-
+# from langchain.text_splitter import TokenTextSplitter
 
 app = FastAPI()
 
@@ -62,7 +58,6 @@ def fix_incomplete_utf8(words):
 async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "花香蕉的钱,只能请到猴子. " * 5
-
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
     encoding = tiktoken.get_encoding("cl100k_base")

From 5b0570442e54ec24abd30e18e9ed29b41729f5d4 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:12:55 +0800
Subject: [PATCH 12/15] Update main.py

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 9073089..6636e96 100644
--- a/main.py
+++ b/main.py
@@ -24,7 +24,7 @@
 )
 
 time_to_sleep = 1
-time_to_sleep_stream = 0.3
+time_to_sleep_stream = 2
 
 logger.add('error.log', level=40)

From 2447fba8ad6fb90a2a7438b0e212a8d51855135b Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:21:51 +0800
Subject: [PATCH 13/15] Update main.py

---
 main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 6636e96..87d019b 100644
--- a/main.py
+++ b/main.py
@@ -55,9 +55,8 @@ def fix_incomplete_utf8(words):
     return fixed_words
 
 
-async def data_generator():
+async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5):
     response_id = uuid.uuid4().hex
-    sentence = "花香蕉的钱,只能请到猴子. " * 5
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
     encoding = tiktoken.get_encoding("cl100k_base")
     token_integers = encoding.encode(sentence)
     words = [encoding.decode_single_token_bytes(token) for token in token_integers]
@@ -91,7 +90,7 @@ async def completion(request: Request):
 
     if data.get("stream") == True:
         return StreamingResponse(
-            content=data_generator(),
+            content=data_generator(data['messages'][0]['content']),
             media_type="text/event-stream",
         )
     else:

From b969182b2f66e8544e849a4415ec71370f6e1e09 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Wed, 8 May 2024 12:36:52 +0800
Subject: [PATCH 14/15] Update README.md

---
 README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/README.md b/README.md
index 517fa55..0db80e6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,46 @@
 # example_openai_endpoint
 An example OpenAI /chat/completions endpoint
+
+## use
+you can use my endpoint
+
+### openai
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+)
+print(chat_completion.choices[0].message.content)
+
+```
+
+### langchain
+``` python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+
+for i in llm.stream("the endpoint will only repeat what I say token by token(not words), Neurocomputational Multidimensionality"):
+    print(i.content, end='')
+
+llm.invoke('Hello World')
+```

From a5c43be631db81b9121902f74739ab1c1618da85 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 8 May 2024 14:56:42 +0800
Subject: [PATCH 15/15] Specify errors, reply content, and response time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md |  99 +++++++++++++++++++++++++++++++++++----
 error.log |  13 ++++++
 main.py   | 137 +++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 194 insertions(+), 55 deletions(-)
 create mode 100644 error.log

diff --git a/README.md b/README.md
index 0db80e6..ebea899 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # example_openai_endpoint
-An example OpenAI /chat/completions endpoint
+An example OpenAI /chat/completions endpoint that can also simulate common failures
 
-## use
-you can use my endpoint
+## Quickstart
+Just use my hosted endpoint:
 
 ### openai
 
@@ -12,9 +12,9 @@ from openai import OpenAI
 
 client = OpenAI(
     base_url='http://fakeapi.liuhetian.work/v1',
-    api_key='sk-anytokenisok',
+    api_key='sk-anythingisok', # doesn't matter
+    max_retries=0,
 )
-
 chat_completion = client.chat.completions.create(
     messages=[
         {
@@ -25,7 +25,6 @@ chat_completion = client.chat.completions.create(
     model="gpt-3.5-turbo",
 )
 print(chat_completion.choices[0].message.content)
-
 ```
 
 ### langchain
 ``` python
@@ -34,8 +33,8 @@ from langchain_openai import ChatOpenAI
 
 llm = ChatOpenAI(
     base_url='http://fakeapi.liuhetian.work/v1',
-    api_key='sk-anytokenisok',
-    model='fake-openai',
+    api_key='sk-anythingisok',
+    model='fake-openai', # doesn't matter
     max_retries=0,
 )
 
@@ -44,3 +43,87 @@ for i in llm.stream("the endpoint will only repeat what I say token by token(not
 
 llm.invoke('Hello World')
 ```
+
+## Customize response
+
+### Customize reply
+You may want to customize the LLM's reply. Without customization, this endpoint simply repeats the first message you send.
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages = [
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    extra_body={'fk_reply': 'this is a test'}
+)
+print(chat_completion.choices[0].message.content)
+```
+
+
+### Customize error
+If you want to simulate exceptions that may occur, add the parameter `extra_body={'fk_error': 429}`:
+
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+
+)
+try:
+    chat_completion = client.chat.completions.create(
+        messages = [
+            {
+                "role": "user",
+                "content": "Say this is a test",
+            }
+        ],
+        model="gpt-3.5-turbo",
+        extra_body={'fk_error': 429} # 500
+    )
+except openai.APIConnectionError as e:
+    print("The server could not be reached")
+    print(e.__cause__) # an underlying Exception, likely raised within httpx.
+except openai.RateLimitError as e:
+    print("A 429 status code was received; we should back off a bit.")
+except openai.APIStatusError as e:
+    print("Another non-200-range status code was received")
+    print(e.status_code)
+    print(e.response)
+```
+
+### Customize response speed
+
+Streaming returns the reply one token at a time (not word by word). By setting the delay parameters you can clearly see the content of each token.
+
+```python
+from langchain_openai import ChatOpenAI
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+for i in llm.bind(extra_body={'fk_time_to_sleep': 0, 'fk_time_to_sleep_stream': 2}).stream(
+    "the endpoint will only repeat what I say token by token(not words),"
+    " for example: Neurocomputational Multidimensionality"
+):
+    print(i.content, end='')
+
+# fk_time_to_sleep is the first-token response time
+# fk_time_to_sleep_stream is the response time after the previous token
+```
\ No newline at end of file
diff --git a/error.log b/error.log
new file mode 100644
index 0000000..433dcc8
--- /dev/null
+++ b/error.log
@@ -0,0 +1,13 @@
+2024-05-08 13:57:40.905 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:42.331 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:44.803 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:01:54.818 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:03:38.980 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:08:42.010 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:13:19.102 | ERROR | __main__:custom_exception_handler:44 - Uncaught exception: Fake Internal Server Error
+2024-05-08 14:13:26.087 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:07.260 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:32.304 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:44.404 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:16:44.675 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:48:04.068 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.

diff --git a/main.py b/main.py
index 87d019b..cccfffe 100644
--- a/main.py
+++ b/main.py
@@ -23,11 +23,21 @@
     allow_headers=["*"],
 )
 
-time_to_sleep = 1
-time_to_sleep_stream = 2
+class RateLimitError(Exception):
+    ...
 
 logger.add('error.log', level=40)
 
+@app.exception_handler(RateLimitError)
+async def rl_exception_handler(request: Request, exc: RateLimitError):
+    # Log the exception details
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # Return a generic error response
+    return JSONResponse(
+        status_code=429,
+        content={"message": str(exc)},
+    )
+
 @app.exception_handler(Exception)
 async def custom_exception_handler(request: Request, exc: Exception):
     # Log the exception details
@@ -35,7 +45,7 @@ async def custom_exception_handler(request: Request, exc: Exception):
     # Return a generic error response
     return JSONResponse(
         status_code=500,
-        content={"message": "An unexpected error occurred"},
+        content={"message": str(exc)},
     )
 
 def fix_incomplete_utf8(words):
@@ -53,9 +63,28 @@ def fix_incomplete_utf8(words):
     if combined:
         fixed_words.append(bytes(combined)) # Append the trailing byte sequence
     return fixed_words
+
+def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125"):
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo-0125":  # note: future models may deviate from this
+        num_tokens = 0
+        for message in messages:
+            num_tokens += 4  # every message follows {role/name}\n{content}\n
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value))
+                if key == "name":  # if there's a name, the role is omitted
+                    num_tokens += -1  # role is always required and always 1 token
+        num_tokens += 2  # every reply is primed with assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
 
 
-async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5):
+async def data_generator(sentence="花香蕉的钱,只能请到猴子. ", time_to_sleep_stream=2):
     response_id = uuid.uuid4().hex
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
@@ -80,17 +109,27 @@ async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
-    if time_to_sleep:
-        # print(f"sleeping for {time_to_sleep}")
-        await asyncio.sleep(float(time_to_sleep))
     data = await request.json()
-    #print(data)
-    print(time.time())
+    print(data)
+    await asyncio.sleep(float(data.get('fk_time_to_sleep', 0.1)))
+
+    fk_error = data.get('fk_error')
+    if fk_error == 500:
+        raise ValueError('Fake Internal Server Error')
+    if fk_error == 429:
+        raise RateLimitError("A 429 status code was received; we should back off a bit.")
+
+    fk_reply = data.get('fk_reply', data['messages'][0]['content']) or 'You is my friend!'
+    prompt_tokens = num_tokens_from_messages(data['messages'])
+    encoding = tiktoken.get_encoding("cl100k_base")
+    completion_tokens = len(encoding.encode(fk_reply))
+    total_tokens = prompt_tokens + completion_tokens
 
     if data.get("stream") == True:
+        fk_time_to_sleep_stream = float(data.get('fk_time_to_sleep_stream', 0.1))
         return StreamingResponse(
-            content=data_generator(data['messages'][0]['content']),
+            content=data_generator(fk_reply, fk_time_to_sleep_stream),
             media_type="text/event-stream",
         )
     else:
@@ -106,52 +145,56 @@ async def completion(request: Request):
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": "\n\nHello there, how may I assist you today?",
+                    "content": fk_reply,
                 },
                 "logprobs": None,
                 "finish_reason": "stop",
             }
         ],
-        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        },
     }
 
     return response
 
-# for completion
-@app.post("/completions")
-@app.post("/v1/completions")
-async def text_completion(request: Request):
-    data = await request.json()
-
-    if data.get("stream") == True:
-        return StreamingResponse(
-            content=data_generator(),
-            media_type="text/event-stream",
-        )
-    else:
-        response_id = uuid.uuid4().hex
-        response = {
-            "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
-            "choices": [
-                {
-                    "finish_reason": "length",
-                    "index": 0,
-                    "logprobs": None,
-                    "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
-                }
-            ],
-            "created": 1712420078,
-            "model": "gpt-3.5-turbo-instruct-0914",
-            "object": "text_completion",
-            "system_fingerprint": None,
-            "usage": {
-                "completion_tokens": 16,
-                "prompt_tokens": 10,
-                "total_tokens": 26
-            }
-        }
-
-        return response
+# # for completion
+# @app.post("/completions")
+# @app.post("/v1/completions")
+# async def text_completion(request: Request):
+#     data = await request.json()
+
+#     if data.get("stream") == True:
+#         return StreamingResponse(
+#             content=data_generator(),
+#             media_type="text/event-stream",
+#         )
+#     else:
+#         response_id = uuid.uuid4().hex
+#         response = {
+#             "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
+#             "choices": [
+#                 {
+#                     "finish_reason": "length",
+#                     "index": 0,
+#                     "logprobs": None,
+#                     "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
+#                 }
+#             ],
+#             "created": 1712420078,
+#             "model": "gpt-3.5-turbo-instruct-0914",
+#             "object": "text_completion",
+#             "system_fingerprint": None,
+#             "usage": {
+#                 "completion_tokens": 16,
+#                 "prompt_tokens": 10,
+#                 "total_tokens": 26
+#             }
+#         }
+
+#         return response
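
A note on the core technique introduced in patches 08-10: `encoding.decode_single_token_bytes()` can return an incomplete UTF-8 sequence for a multi-byte character (which is why the Chinese test sentence is used), so `fix_incomplete_utf8` buffers bytes until they decode cleanly. The following standalone sketch mirrors that helper outside the FastAPI app; it only assumes the public `tiktoken` API and is not itself part of the patch series.

```python
# Standalone sketch: tokenize a sentence, then regroup the raw token bytes so
# every emitted chunk is valid UTF-8 (multi-byte characters are never split).
import tiktoken


def regroup_utf8(token_bytes):
    """Merge per-token byte strings until each group decodes as valid UTF-8."""
    combined = bytearray()
    groups = []
    for piece in token_bytes:
        combined.extend(piece)
        try:
            combined.decode("utf-8")    # do we have complete character(s) yet?
        except UnicodeDecodeError:
            continue                    # not yet, keep accumulating bytes
        groups.append(bytes(combined))
        combined.clear()
    if combined:
        groups.append(bytes(combined))  # trailing bytes, possibly incomplete
    return groups


if __name__ == "__main__":
    encoding = tiktoken.get_encoding("cl100k_base")
    sentence = "花香蕉的钱,只能请到猴子. "
    token_bytes = [encoding.decode_single_token_bytes(t) for t in encoding.encode(sentence)]
    for chunk in regroup_utf8(token_bytes):
        print(chunk.decode("utf-8"), end="|")
    print()
```

Each printed segment is a decodable chunk, which is exactly what the streaming endpoint needs before it can put the text into a JSON delta.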
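For readers testing the final endpoint, here is a hedged sketch of calling it without the OpenAI SDK. It assumes the patched `main.py` is running locally (for example via `uvicorn main:app --port 8000`) and uses `httpx`, which is not a dependency of the repository. It illustrates that the `fk_*` options from the README are ordinary fields in the JSON request body; the SDK's `extra_body` simply merges them into this payload.

```python
# Minimal sketch: call the fake endpoint directly over HTTP (assumed local server).
import httpx

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test"}],
    "fk_reply": "this is a test",   # custom reply text
    "fk_time_to_sleep": 0.5,        # first-token latency in seconds
    # "fk_error": 429,              # uncomment to simulate a rate-limit error
}

response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=30)
print(response.status_code)
print(response.json()["choices"][0]["message"]["content"])
```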