diff --git a/README.md b/README.md
index 517fa55..ebea899 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,129 @@
 # example_openai_endpoint
-An example OpenAI /chat/completions endpoint
+An example OpenAI /chat/completions endpoint that can also simulate some of the exceptions that occur in real use
+
+## Quickstart
+Just use the hosted endpoint.
+
+### openai
+
+```python
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anythingisok',  # doesn't matter
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+)
+print(chat_completion.choices[0].message.content)
+```
+
+### langchain
+```python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anythingisok',
+    model='fake-openai',  # doesn't matter
+    max_retries=0,
+)
+
+for i in llm.stream("the endpoint will only repeat what I say token by token (not words), Neurocomputational Multidimensionality"):
+    print(i.content, end='')
+
+llm.invoke('Hello World')
+```
+
+## Customize response
+
+### Customize reply
+You may want to customize the LLM's reply. Without customization, this endpoint simply repeats the first message you send; pass `fk_reply` to set the reply yourself.
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+)
+chat_completion = client.chat.completions.create(
+    messages=[
+        {
+            "role": "user",
+            "content": "Say this is a test",
+        }
+    ],
+    model="gpt-3.5-turbo",
+    extra_body={'fk_reply': 'this is a test'}
+)
+print(chat_completion.choices[0].message.content)
+```
+
+
+### Customize error
+If you want to simulate exceptions that may occur, add the parameter `extra_body={'fk_error': 429}`:
+
+```python
+from openai import OpenAI
+import openai
+
+client = OpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    max_retries=0,
+
+)
+try:
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": "Say this is a test",
+            }
+        ],
+        model="gpt-3.5-turbo",
+        extra_body={'fk_error': 429}  # or 500 for a fake internal server error
+    )
+except openai.APIConnectionError as e:
+    print("The server could not be reached")
+    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
+except openai.RateLimitError as e:
+    print("A 429 status code was received; we should back off a bit.")
+except openai.APIStatusError as e:
+    print("Another non-200-range status code was received")
+    print(e.status_code)
+    print(e.response)
+```
+
+### Customize response speed
+
+When streaming, the endpoint returns the reply one token at a time (not word by word). By setting the sleep parameters below you can clearly see the content of each token as it arrives.
+
+```python
+from langchain_openai import ChatOpenAI
+llm = ChatOpenAI(
+    base_url='http://fakeapi.liuhetian.work/v1',
+    api_key='sk-anytokenisok',
+    model='fake-openai',
+    max_retries=0,
+)
+for i in llm.bind(extra_body={'fk_time_to_sleep': 0, 'fk_time_to_sleep_stream': 2}).stream(
+    "the endpoint will only repeat what I say token by token (not words),"
+    " for example: Neurocomputational Multidimensionality"
+):
+    print(i.content, end='')
+
+# fk_time_to_sleep is the delay before the first token is returned
+# fk_time_to_sleep_stream is the delay after each previous token
+```
\ No newline at end of file
diff --git a/error.log b/error.log
new file mode 100644
index 0000000..433dcc8
--- /dev/null
+++ b/error.log
@@ -0,0 +1,13 @@
+2024-05-08 13:57:40.905 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:42.331 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 13:57:44.803 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:01:54.818 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:03:38.980 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:08:42.010 | ERROR | __main__:custom_exception_handler:34 - Uncaught exception: division by zero
+2024-05-08 14:13:19.102 | ERROR | __main__:custom_exception_handler:44 - Uncaught exception: Fake Internal Server Error
+2024-05-08 14:13:26.087 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:07.260 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:32.304 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:15:44.404 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:16:44.675 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
+2024-05-08 14:48:04.068 | ERROR | __main__:rl_exception_handler:34 - Uncaught exception: A 429 status code was received; we should back off a bit.
diff --git a/main.py b/main.py
index d5d9b0e..cccfffe 100644
--- a/main.py
+++ b/main.py
@@ -2,13 +2,16 @@
 from fastapi.responses import StreamingResponse
 from fastapi.security import OAuth2PasswordBearer
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.responses import JSONResponse
+from loguru import logger
 import asyncio
 import json
 import uuid
 import asyncio
 import os
-from dotenv import load_dotenv
-load_dotenv()
+import time
+import tiktoken
+# from langchain.text_splitter import TokenTextSplitter
 
 app = FastAPI()
 
@@ -20,41 +23,113 @@
     allow_headers=["*"],
 )
 
 
-
-def data_generator():
-    response_id = uuid.uuid4().hex
-    sentence = "Hello this is a test response from a fixed OpenAI endpoint."
-    words = sentence.split(" ")
+class RateLimitError(Exception):
+    ...
+
+logger.add('error.log', level=40)
+
+@app.exception_handler(RateLimitError)
+async def rl_exception_handler(request: Request, exc: RateLimitError):
+    # log the exception
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # return a generic error response
+    return JSONResponse(
+        status_code=429,
+        content={"message": str(exc)},
+    )
+
+@app.exception_handler(Exception)
+async def custom_exception_handler(request: Request, exc: Exception):
+    # log the exception
+    logger.error("Uncaught exception: {0}".format(str(exc)))
+    # return a generic error response
+    return JSONResponse(
+        status_code=500,
+        content={"message": str(exc)},
+    )
+
+def fix_incomplete_utf8(words):
+    combined = bytearray()
+    fixed_words = []
     for word in words:
-        word = word + " "
-        chunk = {
-            "id": f"chatcmpl-{response_id}",
-            "object": "chat.completion.chunk",
-            "created": 1677652288,
-            "model": "gpt-3.5-turbo-0125",
-            "choices": [{"index": 0, "delta": {"content": word}}],
-        }
         try:
-            yield f"data: {json.dumps(chunk.dict())}\n\n"
-        except:
-            yield f"data: {json.dumps(chunk)}\n\n"
+            combined.extend(word)
+            # try decoding to check whether we have a complete UTF-8 character
+            combined.decode('utf-8')
+            fixed_words.append(bytes(combined))
+            combined.clear()
+        except UnicodeDecodeError:
+            continue  # on a decode error, keep appending bytes until they decode cleanly
+    if combined:
+        fixed_words.append(bytes(combined))  # append the trailing byte sequence
+    return fixed_words
+
+def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125"):
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo-0125":  # note: future models may deviate from this
+        num_tokens = 0
+        for message in messages:
+            num_tokens += 4  # every message follows {role/name}\n{content}\n
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value))
+                if key == "name":  # if there's a name, the role is omitted
+                    num_tokens += -1  # role is always required and always 1 token
+        num_tokens += 2  # every reply is primed with assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
+
+async def data_generator(sentence="花香蕉的钱,只能请到猴子.", time_to_sleep_stream=2):
+    response_id = uuid.uuid4().hex
+    # a = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1, chunk_overlap=0)
+    # words = a.split_text(sentence)
+    encoding = tiktoken.get_encoding("cl100k_base")
+    token_integers = encoding.encode(sentence)
+    words = [encoding.decode_single_token_bytes(token) for token in token_integers]
+    fixed_words = fix_incomplete_utf8(words)
+    for word in fixed_words:
+        chunk = {
+            "id": f"chatcmpl-{response_id}",
+            "object": "chat.completion.chunk",
+            "created": 1677652288,
+            "model": "gpt-3.5-turbo-0125",
+            "choices": [{"index": 0, "delta": {"content": word.decode('utf-8')}}],
+        }
+        yield f"data: {json.dumps(chunk)}\n\n"
+        if time_to_sleep_stream:
+            await asyncio.sleep(time_to_sleep_stream)
 
 
 # for completion
 @app.post("/chat/completions")
 @app.post("/v1/chat/completions")
 @app.post("/openai/deployments/{model:path}/chat/completions")  # azure compatible endpoint
 async def completion(request: Request):
-    _time_to_sleep = os.getenv("TIME_TO_SLEEP", None)
-    if _time_to_sleep is not None:
-        print("sleeping for " + _time_to_sleep)
-        await asyncio.sleep(float(_time_to_sleep))
     data = await request.json()
+    print(data)
+    await asyncio.sleep(float(data.get('fk_time_to_sleep', 0.1)))
+
+    fk_error = data.get('fk_error')
+    if fk_error == 500:
+        raise ValueError('Fake Internal Server Error')
+    if fk_error == 429:
+        raise RateLimitError("A 429 status code was received; we should back off a bit.")
+
+    fk_reply = data.get('fk_reply', data['messages'][0]['content']) or 'You is my friend!'
+    prompt_tokens = num_tokens_from_messages(data['messages'])
+    encoding = tiktoken.get_encoding("cl100k_base")
+    completion_tokens = len(encoding.encode(fk_reply))
+    total_tokens = prompt_tokens + completion_tokens
 
     if data.get("stream") == True:
+        fk_time_to_sleep_stream = float(data.get('fk_time_to_sleep_stream', 0.1))
         return StreamingResponse(
-            content=data_generator(),
+            content=data_generator(fk_reply, fk_time_to_sleep_stream),
             media_type="text/event-stream",
         )
     else:
@@ -70,52 +145,56 @@ async def completion(request: Request):
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": "\n\nHello there, how may I assist you today?",
+                        "content": fk_reply,
                     },
                     "logprobs": None,
                     "finish_reason": "stop",
                 }
             ],
-            "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": total_tokens
+            },
        }
 
         return response
 
 
-# for completion
-@app.post("/completions")
-@app.post("/v1/completions")
-async def text_completion(request: Request):
-    data = await request.json()
-
-    if data.get("stream") == True:
-        return StreamingResponse(
-            content=data_generator(),
-            media_type="text/event-stream",
-        )
-    else:
-        response_id = uuid.uuid4().hex
-        response = {
-            "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
-            "choices": [
-                {
-                    "finish_reason": "length",
-                    "index": 0,
-                    "logprobs": None,
-                    "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
-                }
-            ],
-            "created": 1712420078,
-            "model": "gpt-3.5-turbo-instruct-0914",
-            "object": "text_completion",
-            "system_fingerprint": None,
-            "usage": {
-                "completion_tokens": 16,
-                "prompt_tokens": 10,
-                "total_tokens": 26
-            }
-        }
-
-        return response
+# # for completion
+# @app.post("/completions")
+# @app.post("/v1/completions")
+# async def text_completion(request: Request):
+#     data = await request.json()
+
+#     if data.get("stream") == True:
+#         return StreamingResponse(
+#             content=data_generator(),
+#             media_type="text/event-stream",
+#         )
+#     else:
+#         response_id = uuid.uuid4().hex
+#         response = {
+#             "id": "cmpl-9B2ycsf0odECdLmrVzm2y8Q12csjW",
+#             "choices": [
+#                 {
+#                     "finish_reason": "length",
+#                     "index": 0,
+#                     "logprobs": None,
+#                     "text": "\n\nA test request, how intriguing\nAn invitation for knowledge bringing\nWith words"
+#                 }
+#             ],
+#             "created": 1712420078,
+#             "model": "gpt-3.5-turbo-instruct-0914",
+#             "object": "text_completion",
+#             "system_fingerprint": None,
+#             "usage": {
+#                 "completion_tokens": 16,
+#                 "prompt_tokens": 10,
+#                 "total_tokens": 26
+#             }
+#         }
+
+#         return response
 
 
@@ -164,4 +243,4 @@ async def embeddings(request: Request):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8090)
\ No newline at end of file
+    uvicorn.run(app, host="0.0.0.0", port=8000)
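For a quick end-to-end check of the `fk_*` parameters this diff introduces, a minimal sketch along the following lines should work. It assumes `main.py` from this diff is running locally (e.g. started with `python main.py`, listening on port 8000); the host, prompt text, and expected output are illustrative assumptions, not part of the change itself.

```python
# Minimal sketch: exercise fk_reply, fk_time_to_sleep and fk_error against a
# locally running main.py (assumed to listen on http://localhost:8000).
from openai import OpenAI
import openai

client = OpenAI(
    base_url='http://localhost:8000/v1',  # assumption: local instance of main.py
    api_key='sk-anythingisok',            # any token is accepted
    max_retries=0,
)

# Custom reply, with a 1-second delay before the non-streaming response is produced.
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
    extra_body={'fk_reply': 'pong', 'fk_time_to_sleep': 1},
)
print(resp.choices[0].message.content)  # -> pong
print(resp.usage)                       # token counts computed with tiktoken in main.py

# Simulated rate limit: the endpoint raises RateLimitError internally and returns HTTP 429.
try:
    client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
        extra_body={'fk_error': 429},
    )
except openai.RateLimitError:
    print("got the simulated 429")
```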