diff --git a/README.md b/README.md
index 517fa55..ebea899 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,129 @@
 # example_openai_endpoint
-An example OpenAI /chat/completions endpoint
+An example OpenAI /chat/completions endpoint that can also simulate some of the exceptions that occur in real use
+
+## Quickstart
+Just use the hosted endpoint.
+
+### openai
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anythingisok',  # doesn't matter
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+)
+print(chat_completion.choices[0].message.content)
+```
+
+### langchain
+```python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anythingisok',
+    model='fake-openai',  # doesn't matter
+    max_retries=0,
+)
+
+for i in llm.stream("the endpoint will only repeat what I say token by token (not words), Neurocomputational Multidimensionality"):
+    print(i.content, end='')
+
+llm.invoke('Hello World')
+```
+
+## Customize response
+
+### Customize reply
+You may want to customize the LLM's reply. Without customization, this endpoint simply repeats the first message you send; pass `fk_reply` to set the reply yourself.
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    extra_body={'fk_reply': 'this is a test'}
+)
+print(chat_completion.choices[0].message.content)
+```
+
+
+### Customize error
+If you want to simulate exceptions that may occur, add the parameter `extra_body={'fk_error': 429}`:
+
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+
+)
+try:
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": "Say this is a test",
+            }
+        ],
+        model="gpt-3.5-turbo",
+        extra_body={'fk_error': 429}  # or 500 for a fake internal server error
+    )
+except openai.APIConnectionError as e:
+    print("The server could not be reached")
+    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
+except openai.RateLimitError as e:
+    print("A 429 status code was received; we should back off a bit.")
+except openai.APIStatusError as e:
+    print("Another non-200-range status code was received")
+    print(e.status_code)
+    print(e.response)
+```
+
+### Customize response speed
+
+When streaming, the endpoint returns the reply one token at a time (not word by word). By setting the sleep parameters below you can clearly see the content of each token as it arrives.
+
+```python
+from langchain_openai import ChatOpenAI
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+for i in llm.bind(extra_body={'fk_time_to_sleep': 0, 'fk_time_to_sleep_stream': 2}).stream(
+    "the endpoint will only repeat what I say token by token (not words),"
+    " for example: Neurocomputational Multidimensionality"
+):
+    print(i.content, end='')
+
+# fk_time_to_sleep is the delay before the first token is returned
+# fk_time_to_sleep_stream is the delay after each previous token
+```
\ No newline at end of file
diff --git a/error.log b/error.log
new file mode 100644
index 0000000..433dcc8
--- /dev/null
+++ b/error.log
@@ -0,0 +1,13 @@
+2024-05-08 13:57:40.905 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:42.331 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:44.803 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:01:54.818 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:03:38.980 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:08:42.010 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:13:19.102 | ERROR | __main__:custom_exception_handler:44 - Uncaught exception: Fake Internal Server Error
+2024-05-08 14:13:26.087 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:07.260 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:32.304 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:44.404 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:16:44.675 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:48:04.068 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
diff --git a/main.py b/main.py
index d5d9b0e..cccfffe 100644
--- a/main.py
+++ b/main.py
@@ -2,13 +2,16 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.responses import JSONResponse
+from loguru import logger
 import asyncio
 import json
 import uuid
 import asyncio
 import os
-from dotenv import load_dotenv
-load_dotenv()
+import time
+import tiktoken
+# from langchain.text_splitter import TokenTextSplitter
 
 app = FastAPI()
 
@@ -20,41 +23,113 @@
     allow_headers=["*"],
 )
 
 
-
-def data_generator():
-    response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint."
-    words = sentence.split(" ")
+class RateLimitError(Exception):
+    ...
+
+logger.add('error.log', level=40)
+
+@app.exception_handler(RateLimitError)
+async def rl_exception_handler(request: Request, exc: RateLimitError):
+    # log the exception
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # return a generic error response
+    return JSONResponse(
+        status_code=429,
+        content={"message": str(exc)},
+    )
+
+@app.exception_handler(Exception)
+async def custom_exception_handler(request: Request, exc: Exception):
+    # log the exception
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # return a generic error response
+    return JSONResponse(
+        status_code=500,
+        content={"message": str(exc)},
+    )
+
+def fix_incomplete_utf8(words):
+    combined = bytearray()
+    fixed_words = []
     for word in words:
-        word = word + " "
-        chunk = {
-            "id": f"chatcmpl-{response_id}",
-            "object": "chat.completion.chunk",
-            "created": 1677652288,
-            "model": "gpt-3.5-turbo-0125",
-            "choices": [{"index": 0, "delta": {"content": word}}],
-        }
         try:
-            yield f"data: {json.dumps(chunk.dict())}\n\n"
-        except:
-            yield f"data: {json.dumps(chunk)}\n\n"
+            combined.extend(word)
+            # try decoding to check whether we have a complete UTF-8 character
+            combined.decode('utf-8')
+            fixed_words.append(bytes(combined))
+            combined.clear()
+        except UnicodeDecodeError:
+            continue  # on a decode error, keep appending bytes until they decode cleanly
+    if combined:
+        fixed_words.append(bytes(combined))  # append the trailing byte sequence
+    return fixed_words
+
+def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125"):
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo-0125":  # note: future models may deviate from this
+        num_tokens = 0
+        for message in messages:
+            num_tokens += 4  # every message follows {role/name}\n{content}\n
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value))
+                if key == "name":  # if there's a name, the role is omitted
+                    num_tokens += -1  # role is always required and always 1 token
+        num_tokens += 2  # every reply is primed with assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
+
+async def data_generator(sentence="花香蕉的钱,只能请到猴子.", time_to_sleep_stream=2):
+    response_id = uuid.uuid4().hex
+    # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    # words = a.split_text(sentence)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    token_integers = encoding.encode(sentence)
+    words = [encoding.decode_single_token_bytes(token) for token in token_integers]
+    fixed_words = fix_incomplete_utf8(words)
+    for word in fixed_words:
+        chunk = {
+            "id": f"chatcmpl-{response_id}",
+            "object": "chat.completion.chunk",
+            "created": 1677652288,
+            "model": "gpt-3.5-turbo-0125",
+            "choices": [{"index": 0, "delta": {"content": word.decode('utf-8')}}],
+        }
+        yield f"data: {json.dumps(chunk)}\n\n"
+        if time_to_sleep_stream:
+            await asyncio.sleep(time_to_sleep_stream)
 
 
 # for completion
 @app.post("/chat/completions")
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions")  # azure compatible endpoint
 async def completion(request: Request):
-    _time_to_sleep = os.getenv("TIME_TO_SLEEP", None)
-    if _time_to_sleep is not None:
-        print("sleeping for " + _time_to_sleep)
-        await asyncio.sleep(float(_time_to_sleep))
     data = await request.json()
+    print(data)
+    await asyncio.sleep(float(data.get('fk_time_to_sleep', 0.1)))
+
+    fk_error = data.get('fk_error')
+    if fk_error == 500:
+        raise ValueError('Fake Internal Server Error')
+    if fk_error == 429:
+        raise RateLimitError("A 429 status code was received; we should back off a bit.")
+
+    fk_reply = data.get('fk_reply', data['messages'][0]['content']) or 'You is my friend!'
+    prompt_tokens = num_tokens_from_messages(data['messages'])
+    encoding = tiktoken.get_encoding("cl100k_base")
+    completion_tokens = len(encoding.encode(fk_reply))
+    total_tokens = prompt_tokens + completion_tokens
 
     if data.get("stream") == True:
+        fk_time_to_sleep_stream = float(data.get('fk_time_to_sleep_stream', 0.1))
         return StreamingResponse(
-            content=data_generator(),
+            content=data_generator(fk_reply, fk_time_to_sleep_stream),
             media_type="text/event-stream",
         )
     else:
@@ -70,52 +145,56 @@ async def completion(request: Request):
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": "\n\nHello there, how may I assist you today?",
+                        "content": fk_reply,
                     },
                     "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
-            "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": total_tokens
+            },
        }
 
         return response
 
 
-# for completion
-@app.post("/completions")
-@app.post("/v1/completions")
-async def text_completion(request: Request):
-    data = await request.json()
-
-    if data.get("stream") == True:
-        return StreamingResponse(
-            content=data_generator(),
-            media_type="text/event-stream",
-        )
-    else:
-        response_id = uuid.uuid4().hex
-        response = {
-            "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
-            "choices": [
-                {
-                    "finish_reason": "length",
-                    "index": 0,
-                    "logprobs": None,
-                    "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
-                }
-            ],
-            "created": 1712420078,
-            "model": "gpt-3.5-turbo-instruct-0914",
-            "object": "text_completion",
-            "system_fingerprint": None,
-            "usage": {
-                "completion_tokens": 16,
-                "prompt_tokens": 10,
-                "total_tokens": 26
-            }
-        }
-
-        return response
+# # for completion
+# @app.post("/completions")
+# @app.post("/v1/completions")
+# async def text_completion(request: Request):
+#     data = await request.json()
+
+#     if data.get("stream") == True:
+#         return StreamingResponse(
+#             content=data_generator(),
+#             media_type="text/event-stream",
+#         )
+#     else:
+#         response_id = uuid.uuid4().hex
+#         response = {
+#             "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
+#             "choices": [
+#                 {
+#                     "finish_reason": "length",
+#                     "index": 0,
+#                     "logprobs": None,
+#                     "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
+#                 }
+#             ],
+#             "created": 1712420078,
+#             "model": "gpt-3.5-turbo-instruct-0914",
+#             "object": "text_completion",
+#             "system_fingerprint": None,
+#             "usage": {
+#                 "completion_tokens": 16,
+#                 "prompt_tokens": 10,
+#                 "total_tokens": 26
+#             }
+#         }
+
+#         return response
 
 
@@ -164,4 +243,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
\ No newline at end of file
+    uvicorn.run(app, host="0.0.0.0", port=8000)
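For a quick end-to-end check of the `fk_*` parameters this diff introduces, a minimal sketch along the following lines should work. It assumes `main.py` from this diff is running locally (e.g. started with `python main.py`, listening on port 8000); the host, prompt text, and expected output are illustrative assumptions, not part of the change itself.

```python
# Minimal sketch: exercise fk_reply, fk_time_to_sleep and fk_error against a
# locally running main.py (assumed to listen on http://localhost:8000).
from openai import OpenAI
import openai

client = OpenAI(
    base_url='http://localhost:8000/v1',  # assumption: local instance of main.py
    api_key='sk-anythingisok',            # any token is accepted
    max_retries=0,
)

# Custom reply, with a 1-second delay before the non-streaming response is produced.
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
    extra_body={'fk_reply': 'pong', 'fk_time_to_sleep': 1},
)
print(resp.choices[0].message.content)  # -> pong
print(resp.usage)                       # token counts computed with tiktoken in main.py

# Simulated rate limit: the endpoint raises RateLimitError internally and returns HTTP 429.
try:
    client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
        extra_body={'fk_error': 429},
    )
except openai.RateLimitError:
    print("got the simulated 429")
```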