From 52e0ef495dacd28cd910d4dc54715aa3c1acc242 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 14:28:45 +0800
Subject: [PATCH 01/15] test

---
 main.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/main.py b/main.py
index d5d9b0e..5a9ed56 100644
--- a/main.py
+++ b/main.py
@@ -7,8 +7,6 @@ import uuid
 import asyncio
 import os
 
-from dotenv import load_dotenv
-load_dotenv()
 
 app = FastAPI()
 
@@ -164,4 +162,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
\ No newline at end of file
+    uvicorn.run(app, host="0.0.0.0", port=8090)

From 0ccbe328698a0951dc59caad28b26ae66c7b3bf6 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 14:31:44 +0800
Subject: [PATCH 02/15] Update main.py

---
 main.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index 5a9ed56..6694d1c 100644
--- a/main.py
+++ b/main.py
@@ -18,37 +18,38 @@
     allow_headers=["*"],
 )
 
+time_to_sleep = 0
 
-def data_generator():
+async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "Hello this is a test response from a fixed OpenAI endpoint."
     words = sentence.split(" ")
     for word in words:
         word = word + " "
         chunk = {
-                "id": f"chatcmpl-{response_id}",
-                "object": "chat.completion.chunk",
-                "created": 1677652288,
-                "model": "gpt-3.5-turbo-0125",
-                "choices": [{"index": 0, "delta": {"content": word}}],
-            }
+            "id": f"chatcmpl-{response_id}",
+            "object": "chat.completion.chunk",
+            "created": 1677652288,
+            "model": "gpt-3.5-turbo-0125",
+            "choices": [{"index": 0, "delta": {"content": word}}],
+        }
         try:
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-
+        await asyncio.sleep(1)
 
 # for completion
 @app.post("/chat/completions")
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
-    _time_to_sleep = os.getenv("TIME_TO_SLEEP", None)
-    if _time_to_sleep is not None:
-        print("sleeping for " + _time_to_sleep)
-        await asyncio.sleep(float(_time_to_sleep))
+    if time_to_sleep:
+        print("sleeping for " + time_to_sleep)
+        await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
+    print(data)
 
     if data.get("stream") == True:
         return StreamingResponse(

From bbe8ac914f6c8873b27eae5a8d5050b60a456ffd Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 14:43:18 +0800
Subject: [PATCH 03/15] fix

---
 main.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 6694d1c..9babca7 100644
--- a/main.py
+++ b/main.py
@@ -18,7 +18,8 @@
     allow_headers=["*"],
 )
 
-time_to_sleep = 0
+time_to_sleep = 1
+time_to_sleep_steam = 0.3
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -37,7 +38,8 @@ async def data_generator():
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-        await asyncio.sleep(1)
+        if time_to_sleep_steam:
+            await asyncio.sleep(time_to_sleep_stream)
 
 # for completion
 @app.post("/chat/completions")
@@ -45,7 +47,7 @@ async def data_generator():
 async def completion(request: Request):
     if time_to_sleep:
-        print("sleeping for " + time_to_sleep)
+        print(f"sleeping for {time_to_sleep}")
         await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
@@ -163,4 +165,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
+    uvicorn.run(app, host="0.0.0.0", port=8000)

From b6928f333b2d7e2d1871af6dcbc6bd20c942e148 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Apr 2024 17:40:07 +0800
Subject: [PATCH 04/15] fix

---
 main.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 9babca7..88b0e11 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@ import uuid
 import asyncio
 import os
-
+import time
 
 app = FastAPI()
 
 app.add_middleware(
@@ -19,7 +19,7 @@
 )
 
 time_to_sleep = 1
-time_to_sleep_steam = 0.3
+time_to_sleep_stream = 0.3
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -38,7 +38,7 @@ async def data_generator():
             yield f"data: {json.dumps(chunk.dict())}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
-        if time_to_sleep_steam:
+        if time_to_sleep_stream:
             await asyncio.sleep(time_to_sleep_stream)
 
 # for completion
@@ -47,11 +47,12 @@ async def data_generator():
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
     if time_to_sleep:
-        print(f"sleeping for {time_to_sleep}")
+        # print(f"sleeping for {time_to_sleep}")
         await asyncio.sleep(float(time_to_sleep))
 
     data = await request.json()
-    print(data)
+    #print(data)
+    print(time.time())
 
     if data.get("stream") == True:
         return StreamingResponse(

From d60eb62eec08fa58978e3d7a80138ad8d5208ade Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:43:02 +0800
Subject: [PATCH 05/15] Update main.py

---
 main.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/main.py b/main.py
index 88b0e11..2060f24 100644
--- a/main.py
+++ b/main.py
@@ -2,12 +2,14 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from loguru import logger
 import asyncio
 import json
 import uuid
 import asyncio
 import os
 import time
+
 
 app = FastAPI()
 
 app.add_middleware(
@@ -21,6 +23,19 @@
 time_to_sleep = 1
 time_to_sleep_stream = 0.3
 
+logger.add('error.log', level=40)
+
+@app.exception_handler(Exception)
+async def custom_exception_handler(request: Request, exc: Exception):
+    # Log the exception details
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # Return a generic error response
+    return JSONResponse(
+        status_code=500,
+        content={"message": "An unexpected error occurred"},
+    )
+
+
 async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "Hello this is a test response from a fixed OpenAI endpoint."
From 9f12466bd220c0ad6ebbadf380051caa582a649a Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:58:20 +0800
Subject: [PATCH 06/15] Update main.py

---
 main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.py b/main.py
index 2060f24..d2207fa 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.responses import JSONResponse
 from loguru import logger
 import asyncio
 import json

From 10445167c95839ae1334edf8d4f7dc6dda1066d7 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Tue, 30 Apr 2024 19:45:12 +0800
Subject: [PATCH 07/15] Update main.py

---
 main.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index d2207fa..fd0f21e 100644
--- a/main.py
+++ b/main.py
@@ -10,6 +10,11 @@ import asyncio
 import os
 import time
+from langchain.text_splitter import TokenTextSplitter
+
+
+
+
 
 app = FastAPI()
 
@@ -39,10 +44,14 @@ async def custom_exception_handler(request: Request, exc: Exception):
 
 async def data_generator():
     response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint."
+    sentence = "Hello this is a test response from a fixed OpenAI endpoint. " * 5
     words = sentence.split(" ")
+
+    a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    words = a.split_text(sentence)
+
     for word in words:
-        word = word + " "
+        word = word
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",

From 9c938e137b1f33bd4dff464fe9b05ecd31ec9c3c Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 09:59:20 +0800
Subject: [PATCH 08/15] Update main.py

---
 main.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index fd0f21e..bf1e968 100644
--- a/main.py
+++ b/main.py
@@ -10,6 +10,7 @@ import asyncio
 import os
 import time
+import tiktoken
 from langchain.text_splitter import TokenTextSplitter
 
 
@@ -44,14 +45,15 @@ async def custom_exception_handler(request: Request, exc: Exception):
 
 async def data_generator():
     response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint. " * 5
-    words = sentence.split(" ")
+    sentence = "花香蕉的钱,只能请到猴子. " * 5
 
-    a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
-    words = a.split_text(sentence)
+    # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    # words = a.split_text(sentence)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    token_integers = encoding.encode(sentence)
+    words = [encoding.decode_single_token_bytes(token) for token in token_integers]
 
     for word in words:
-        word = word
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",

From 5a7ca33b0a83d0ef1952dac8cc3487fac285246e Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 10:08:57 +0800
Subject: [PATCH 09/15] Update main.py

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index bf1e968..5aa6571 100644
--- a/main.py
+++ b/main.py
@@ -62,7 +62,7 @@ async def data_generator():
             "choices": [{"index": 0, "delta": {"content": word}}],
         }
         try:
-            yield f"data: {json.dumps(chunk.dict())}\n\n"
+            yield f"data: {json.dumps(chunk)}\n\n"
         except:
             yield f"data: {json.dumps(chunk)}\n\n"
         if time_to_sleep_stream:

From 827bef74283d5d6338396c511af4a7e74e649a16 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:08:44 +0800
Subject: [PATCH 10/15] Update main.py

---
 main.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 5aa6571..e5fa745 100644
--- a/main.py
+++ b/main.py
@@ -42,6 +42,22 @@ async def custom_exception_handler(request: Request, exc: Exception):
         content={"message": "An unexpected error occurred"},
     )
 
+def fix_incomplete_utf8(words):
+    combined = bytearray()
+    fixed_words = []
+    for word in words:
+        try:
+            combined.extend(word)
+            # Try decoding to check whether we have a complete UTF-8 character
+            combined.decode('utf-8')
+            fixed_words.append(bytes(combined))
+            combined.clear()
+        except UnicodeDecodeError:
+            continue # On a decode error, keep appending bytes until they decode
+    if combined:
+        fixed_words.append(bytes(combined)) # Append the trailing byte sequence
+    return fixed_words
+
 
 async def data_generator():
     response_id = uuid.uuid4().hex
@@ -52,19 +68,16 @@ async def data_generator():
     encoding = tiktoken.get_encoding("cl100k_base")
     token_integers = encoding.encode(sentence)
     words = [encoding.decode_single_token_bytes(token) for token in token_integers]
-
-    for word in words:
+    fixed_words = fix_incomplete_utf8(words)
+    for word in fixed_words:
         chunk = {
             "id": f"chatcmpl-{response_id}",
             "object": "chat.completion.chunk",
             "created": 1677652288,
             "model": "gpt-3.5-turbo-0125",
-            "choices": [{"index": 0, "delta": {"content": word}}],
+            "choices": [{"index": 0, "delta": {"content": word.decode('utf-8')}}],
         }
-        try:
-            yield f"data: {json.dumps(chunk)}\n\n"
-        except:
-            yield f"data: {json.dumps(chunk)}\n\n"
+        yield f"data: {json.dumps(chunk)}\n\n"
         if time_to_sleep_stream:
             await asyncio.sleep(time_to_sleep_stream)

From a1acafcae0eab27d4f6a93420ec11de45472503a Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:12:38 +0800
Subject: [PATCH 11/15] Update main.py

---
 main.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/main.py b/main.py
index e5fa745..9073089 100644
--- a/main.py
+++ b/main.py
@@ -11,11 +11,7 @@ import os
 import time
 import tiktoken
-from langchain.text_splitter import TokenTextSplitter
-
-
-
-
+# from langchain.text_splitter import TokenTextSplitter
 
 app = FastAPI()
 
@@ -62,7 +58,6 @@ def fix_incomplete_utf8(words):
 async def data_generator():
     response_id = uuid.uuid4().hex
     sentence = "花香蕉的钱,只能请到猴子. " * 5
-
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
     encoding = tiktoken.get_encoding("cl100k_base")

From 5b0570442e54ec24abd30e18e9ed29b41729f5d4 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:12:55 +0800
Subject: [PATCH 12/15] Update main.py

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 9073089..6636e96 100644
--- a/main.py
+++ b/main.py
@@ -24,7 +24,7 @@
 )
 
 time_to_sleep = 1
-time_to_sleep_stream = 0.3
+time_to_sleep_stream = 2
 
 logger.add('error.log', level=40)

From 2447fba8ad6fb90a2a7438b0e212a8d51855135b Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Mon, 6 May 2024 15:21:51 +0800
Subject: [PATCH 13/15] Update main.py

---
 main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 6636e96..87d019b 100644
--- a/main.py
+++ b/main.py
@@ -55,9 +55,8 @@ def fix_incomplete_utf8(words):
     return fixed_words
 
 
-async def data_generator():
+async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5):
     response_id = uuid.uuid4().hex
-    sentence = "花香蕉的钱,只能请到猴子. " * 5
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
     encoding = tiktoken.get_encoding("cl100k_base")
     token_integers = encoding.encode(sentence)
     words = [encoding.decode_single_token_bytes(token) for token in token_integers]
@@ -91,7 +90,7 @@ async def completion(request: Request):
 
     if data.get("stream") == True:
         return StreamingResponse(
-            content=data_generator(),
+            content=data_generator(data['messages'][0]['content']),
             media_type="text/event-stream",
         )
     else:

From b969182b2f66e8544e849a4415ec71370f6e1e09 Mon Sep 17 00:00:00 2001
From: liuhetian <91518757+liuhetian@users.noreply.github.com>
Date: Wed, 8 May 2024 12:36:52 +0800
Subject: [PATCH 14/15] Update README.md

---
 README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/README.md b/README.md
index 517fa55..0db80e6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,46 @@
 # example_openai_endpoint
 An example OpenAI /chat/completions endpoint
+
+## use
+you can use my endpoint
+
+### openai
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+)
+
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+)
+print(chat_completion.choices[0].message.content)
+
+```
+
+### langchain
+``` python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+
+for i in llm.stream("the endpoint will only repeat what I say token by token(not words), Neurocomputational Multidimensionality"):
+    print(i.content, end='')
+
+llm.invoke('Hello World')
+```

From a5c43be631db81b9121902f74739ab1c1618da85 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 8 May 2024 14:56:42 +0800
Subject: [PATCH 15/15] Specify errors, reply content, and response time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md |  99 +++++++++++++++++++++++++++++++++++----
 error.log |  13 ++++++
 main.py   | 137 +++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 194 insertions(+), 55 deletions(-)
 create mode 100644 error.log

diff --git a/README.md b/README.md
index 0db80e6..ebea899 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # example_openai_endpoint
-An example OpenAI /chat/completions endpoint
+An example OpenAI /chat/completions endpoint that can also simulate common failures
 
-## use
-you can use my endpoint
+## Quickstart
+Just use my hosted endpoint:
 
 ### openai
 
@@ -12,9 +12,9 @@ from openai import OpenAI
 
 client = OpenAI(
     base_url='http://fakeapi.liuhetian.work/v1',
-    api_key='sk-anytokenisok',
+    api_key='sk-anythingisok', # doesn't matter
+    max_retries=0,
 )
-
 chat_completion = client.chat.completions.create(
     messages=[
         {
@@ -25,7 +25,6 @@ chat_completion = client.chat.completions.create(
     model="gpt-3.5-turbo",
 )
 print(chat_completion.choices[0].message.content)
-
 ```
 
 ### langchain
 ``` python
@@ -34,8 +33,8 @@ from langchain_openai import ChatOpenAI
 
 llm = ChatOpenAI(
     base_url='http://fakeapi.liuhetian.work/v1',
-    api_key='sk-anytokenisok',
-    model='fake-openai',
+    api_key='sk-anythingisok',
+    model='fake-openai', # doesn't matter
     max_retries=0,
 )
 
@@ -44,3 +43,87 @@ for i in llm.stream("the endpoint will only repeat what I say token by token(not
 
 llm.invoke('Hello World')
 ```
+
+## Customize response
+
+### Customize reply
+You may want to customize the LLM's reply. Without customization, this endpoint simply repeats the first message you send.
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages = [
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    extra_body={'fk_reply': 'this is a test'}
+)
+print(chat_completion.choices[0].message.content)
+```
+
+
+### Customize error
+If you want to simulate exceptions that may occur, add the parameter `extra_body={'fk_error': 429}`:
+
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+
+)
+try:
+    chat_completion = client.chat.completions.create(
+        messages = [
+            {
+                "role": "user",
+                "content": "Say this is a test",
+            }
+        ],
+        model="gpt-3.5-turbo",
+        extra_body={'fk_error': 429} # 500
+    )
+except openai.APIConnectionError as e:
+    print("The server could not be reached")
+    print(e.__cause__) # an underlying Exception, likely raised within httpx.
+except openai.RateLimitError as e:
+    print("A 429 status code was received; we should back off a bit.")
+except openai.APIStatusError as e:
+    print("Another non-200-range status code was received")
+    print(e.status_code)
+    print(e.response)
+```
+
+### Customize response speed
+
+Streaming returns the reply one token at a time (not word by word). By setting the delay parameters you can clearly see the content of each token.
+
+```python
+from langchain_openai import ChatOpenAI
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+for i in llm.bind(extra_body={'fk_time_to_sleep': 0, 'fk_time_to_sleep_stream': 2}).stream(
+    "the endpoint will only repeat what I say token by token(not words),"
+    " for example: Neurocomputational Multidimensionality"
+):
+    print(i.content, end='')
+
+# fk_time_to_sleep is the first-token response time
+# fk_time_to_sleep_stream is the response time after the previous token
+```
\ No newline at end of file
diff --git a/error.log b/error.log
new file mode 100644
index 0000000..433dcc8
--- /dev/null
+++ b/error.log
@@ -0,0 +1,13 @@
+2024-05-08 13:57:40.905 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:42.331 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:44.803 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:01:54.818 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:03:38.980 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:08:42.010 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:13:19.102 | ERROR | __main__:custom_exception_handler:44 - Uncaught exception: Fake Internal Server Error
+2024-05-08 14:13:26.087 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:07.260 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:32.304 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:44.404 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:16:44.675 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:48:04.068 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.

diff --git a/main.py b/main.py
index 87d019b..cccfffe 100644
--- a/main.py
+++ b/main.py
@@ -23,11 +23,21 @@
     allow_headers=["*"],
 )
 
-time_to_sleep = 1
-time_to_sleep_stream = 2
+class RateLimitError(Exception):
+    ...
 
 logger.add('error.log', level=40)
 
+@app.exception_handler(RateLimitError)
+async def rl_exception_handler(request: Request, exc: RateLimitError):
+    # Log the exception details
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # Return a generic error response
+    return JSONResponse(
+        status_code=429,
+        content={"message": str(exc)},
+    )
+
 @app.exception_handler(Exception)
 async def custom_exception_handler(request: Request, exc: Exception):
     # Log the exception details
@@ -35,7 +45,7 @@ async def custom_exception_handler(request: Request, exc: Exception):
     # Return a generic error response
     return JSONResponse(
         status_code=500,
-        content={"message": "An unexpected error occurred"},
+        content={"message": str(exc)},
     )
 
 def fix_incomplete_utf8(words):
@@ -53,9 +63,28 @@ def fix_incomplete_utf8(words):
     if combined:
         fixed_words.append(bytes(combined)) # Append the trailing byte sequence
     return fixed_words
+
+def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125"):
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo-0125":  # note: future models may deviate from this
+        num_tokens = 0
+        for message in messages:
+            num_tokens += 4  # every message follows {role/name}\n{content}\n
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value))
+                if key == "name":  # if there's a name, the role is omitted
+                    num_tokens += -1  # role is always required and always 1 token
+        num_tokens += 2  # every reply is primed with assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
 
 
-async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5):
+async def data_generator(sentence="花香蕉的钱,只能请到猴子. ", time_to_sleep_stream=2):
     response_id = uuid.uuid4().hex
     # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
     # words = a.split_text(sentence)
@@ -80,17 +109,27 @@ async def data_generator(sentence = "花香蕉的钱,只能请到猴子. " * 5
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions") # azure compatible endpoint
 async def completion(request: Request):
-    if time_to_sleep:
-        # print(f"sleeping for {time_to_sleep}")
-        await asyncio.sleep(float(time_to_sleep))
     data = await request.json()
-    #print(data)
-    print(time.time())
+    print(data)
+    await asyncio.sleep(float(data.get('fk_time_to_sleep', 0.1)))
+
+    fk_error = data.get('fk_error')
+    if fk_error == 500:
+        raise ValueError('Fake Internal Server Error')
+    if fk_error == 429:
+        raise RateLimitError("A 429 status code was received; we should back off a bit.")
+
+    fk_reply = data.get('fk_reply', data['messages'][0]['content']) or 'You is my friend!'
+    prompt_tokens = num_tokens_from_messages(data['messages'])
+    encoding = tiktoken.get_encoding("cl100k_base")
+    completion_tokens = len(encoding.encode(fk_reply))
+    total_tokens = prompt_tokens + completion_tokens
 
     if data.get("stream") == True:
+        fk_time_to_sleep_stream = float(data.get('fk_time_to_sleep_stream', 0.1))
         return StreamingResponse(
-            content=data_generator(data['messages'][0]['content']),
+            content=data_generator(fk_reply, fk_time_to_sleep_stream),
             media_type="text/event-stream",
         )
     else:
@@ -106,52 +145,56 @@ async def completion(request: Request):
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": "\n\nHello there, how may I assist you today?",
+                    "content": fk_reply,
                 },
                 "logprobs": None,
                 "finish_reason": "stop",
             }
         ],
-        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        },
     }
 
     return response
 
-# for completion
-@app.post("/completions")
-@app.post("/v1/completions")
-async def text_completion(request: Request):
-    data = await request.json()
-
-    if data.get("stream") == True:
-        return StreamingResponse(
-            content=data_generator(),
-            media_type="text/event-stream",
-        )
-    else:
-        response_id = uuid.uuid4().hex
-        response = {
-            "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
-            "choices": [
-                {
-                    "finish_reason": "length",
-                    "index": 0,
-                    "logprobs": None,
-                    "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
-                }
-            ],
-            "created": 1712420078,
-            "model": "gpt-3.5-turbo-instruct-0914",
-            "object": "text_completion",
-            "system_fingerprint": None,
-            "usage": {
-                "completion_tokens": 16,
-                "prompt_tokens": 10,
-                "total_tokens": 26
-            }
-        }
-
-        return response
+# # for completion
+# @app.post("/completions")
+# @app.post("/v1/completions")
+# async def text_completion(request: Request):
+#     data = await request.json()
+
+#     if data.get("stream") == True:
+#         return StreamingResponse(
+#             content=data_generator(),
+#             media_type="text/event-stream",
+#         )
+#     else:
+#         response_id = uuid.uuid4().hex
+#         response = {
+#             "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
+#             "choices": [
+#                 {
+#                     "finish_reason": "length",
+#                     "index": 0,
+#                     "logprobs": None,
+#                     "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
+#                 }
+#             ],
+#             "created": 1712420078,
+#             "model": "gpt-3.5-turbo-instruct-0914",
+#             "object": "text_completion",
+#             "system_fingerprint": None,
+#             "usage": {
+#                 "completion_tokens": 16,
+#                 "prompt_tokens": 10,
+#                 "total_tokens": 26
+#             }
+#         }
+
+#         return response
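
A note on the core technique introduced in patches 08-10: `encoding.decode_single_token_bytes()` can return an incomplete UTF-8 sequence for a multi-byte character (which is why the Chinese test sentence is used), so `fix_incomplete_utf8` buffers bytes until they decode cleanly. The following standalone sketch mirrors that helper outside the FastAPI app; it only assumes the public `tiktoken` API and is not itself part of the patch series.

```python
# Standalone sketch: tokenize a sentence, then regroup the raw token bytes so
# every emitted chunk is valid UTF-8 (multi-byte characters are never split).
import tiktoken


def regroup_utf8(token_bytes):
    """Merge per-token byte strings until each group decodes as valid UTF-8."""
    combined = bytearray()
    groups = []
    for piece in token_bytes:
        combined.extend(piece)
        try:
            combined.decode("utf-8")    # do we have complete character(s) yet?
        except UnicodeDecodeError:
            continue                    # not yet, keep accumulating bytes
        groups.append(bytes(combined))
        combined.clear()
    if combined:
        groups.append(bytes(combined))  # trailing bytes, possibly incomplete
    return groups


if __name__ == "__main__":
    encoding = tiktoken.get_encoding("cl100k_base")
    sentence = "花香蕉的钱,只能请到猴子. "
    token_bytes = [encoding.decode_single_token_bytes(t) for t in encoding.encode(sentence)]
    for chunk in regroup_utf8(token_bytes):
        print(chunk.decode("utf-8"), end="|")
    print()
```

Each printed segment is a decodable chunk, which is exactly what the streaming endpoint needs before it can put the text into a JSON delta.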
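For readers testing the final endpoint, here is a hedged sketch of calling it without the OpenAI SDK. It assumes the patched `main.py` is running locally (for example via `uvicorn main:app --port 8000`) and uses `httpx`, which is not a dependency of the repository. It illustrates that the `fk_*` options from the README are ordinary fields in the JSON request body; the SDK's `extra_body` simply merges them into this payload.

```python
# Minimal sketch: call the fake endpoint directly over HTTP (assumed local server).
import httpx

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Say this is a test"}],
    "fk_reply": "this is a test",   # custom reply text
    "fk_time_to_sleep": 0.5,        # first-token latency in seconds
    # "fk_error": 429,              # uncomment to simulate a rate-limit error
}

response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=30)
print(response.status_code)
print(response.json()["choices"][0]["message"]["content"])
```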