Merge remote-tracking branch 'upstream/main'
keeping up to date with original
itsknk committed Jul 20, 2024
2 parents 0222b2e + e49924e commit f554ae0
Showing 5 changed files with 138 additions and 15 deletions.
127 changes: 126 additions & 1 deletion .github/workflows/test.yml
@@ -7,7 +7,7 @@ on:
branches: [ main ]

jobs:
- test:
+ unit_test:
runs-on: macos-14
steps:
- uses: actions/checkout@v2
@@ -21,3 +21,128 @@ jobs:
pip install .
- name: Run tests
run: python3 -m exo.inference.test_inference_engine
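The unit test job can be reproduced locally with the same commands the job runs (a sketch, assuming a working Python 3 environment on macOS):

```sh
# Local reproduction of the unit_test job above.
python3 -m pip install --upgrade pip
pip install .
python3 -m exo.inference.test_inference_engine
```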

discovery_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run discovery integration test
run: |
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Stop both instances
kill $PID1 $PID2
# Check outputs
if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
echo "Test passed: Both instances discovered each other"
exit 0
else
echo "Test failed: Devices did not discover each other"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
fi
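The fixed `sleep 10` assumes discovery always finishes within ten seconds. A polling variant (a sketch, not part of this commit) would bound the wait at 30 seconds while proceeding sooner when discovery succeeds early:

```sh
# Sketch: poll both logs instead of sleeping for a fixed interval.
for _ in $(seq 1 30); do
  if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
    break  # both instances have discovered each other
  fi
  sleep 1
done
```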
chatgpt_api_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run chatgpt api integration test
run: |
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 1200 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout-secs 1200 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Send a placeholder request to each node to trigger model loading
curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Placeholder to load model..."}],
"temperature": 0.7
}'
curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Placeholder to load model..."}],
"temperature": 0.7
}'
response_1=$(curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 1: $response_1"
response_2=$(curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 2: $response_2"
if ! echo "$response_1" | grep -q "Michael Jackson" || ! echo "$response_2" | grep -q "Michael Jackson"; then
echo "Test failed: Response does not contain 'Michael Jackson'"
echo "Response 1: $response_1"
echo "Response 2: $response_2"
exit 1
else
echo "Test passed: Response from both nodes contains 'Michael Jackson'"
fi
# Stop both instances
kill $PID1 $PID2
# Check outputs
if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
echo "Test passed: Both instances discovered each other"
exit 0
else
echo "Test failed: Devices did not discover each other"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
fi
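Grepping the raw JSON also matches occurrences outside the assistant message. Assuming the endpoint returns standard ChatGPT-style completion JSON and that jq is available on the runner, a stricter check (a sketch, not part of this commit) could target the message content directly:

```sh
# Sketch: assert on the assistant message text alone.
content_1=$(echo "$response_1" | jq -r '.choices[0].message.content')
echo "$content_1" | grep -q "Michael Jackson" || exit 1
```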
8 changes: 2 additions & 6 deletions README.md
@@ -104,20 +104,16 @@ The native way to access models running on exo is using the exo library with pee

exo also starts a ChatGPT-compatible API endpoint at http://localhost:8000. Note: this is currently only supported by tail nodes (i.e., nodes selected to sit at the end of the ring topology). Example request:

- ```
+ ```sh
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-70b",
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "What is the meaning of exo?"}],
"temperature": 0.7
}'
```

- ```sh
- curl -X POST http://localhost:8001/api/v1/chat -H "Content-Type: application/json" -d '{"messages": [{"role": "user", "content": "What is the meaning of life?"}]}'
- ```

## Debugging

Enable debug logs with the DEBUG environment variable (0-9).
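For example, the CI jobs in this commit run nodes with both variables at maximum verbosity (DEBUG_DISCOVERY is assumed here to control discovery-specific logging):

```sh
# Maximum verbosity for general and discovery logging, as in the CI jobs above.
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py
```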
4 changes: 2 additions & 2 deletions exo/api/chatgpt_api.py
@@ -117,10 +117,10 @@ def build_prompt(tokenizer, messages: List[Message]):


class ChatGPTAPI:
- def __init__(self, node: Node, inference_engine_classname: str):
+ def __init__(self, node: Node, inference_engine_classname: str, response_timeout_secs: int = 90):
self.node = node
self.inference_engine_classname = inference_engine_classname
- self.response_timeout_secs = 90
+ self.response_timeout_secs = response_timeout_secs
self.app = web.Application()
self.prev_token_lens: Dict[str, int] = {}
self.stream_tasks: Dict[str, asyncio.Task] = {}
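The timeout is no longer hard-coded at 90 seconds; it is wired through from the --chatgpt-api-response-timeout-secs flag added to main.py below. A usage sketch, with the 1200-second value the integration test passes for long model loads:

```sh
# Raise the ChatGPT API response timeout from the default 90 s to 20 minutes.
python3 main.py --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 1200
```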
11 changes: 6 additions & 5 deletions exo/inference/test_inference_engine.py
@@ -30,8 +30,9 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
"mlx-community/Meta-Llama-3-8B-Instruct-4bit",
))

- asyncio.run(test_inference_engine(
-     TinygradDynamicShardInferenceEngine(),
-     TinygradDynamicShardInferenceEngine(),
-     "llama3-8b-sfr",
- ))
+ # TODO: Waiting on https://github.com/tinygrad/tinygrad/issues/5549
+ # asyncio.run(test_inference_engine(
+ #     TinygradDynamicShardInferenceEngine(),
+ #     TinygradDynamicShardInferenceEngine(),
+ #     "llama3-8b-sfr",
+ # ))
3 changes: 2 additions & 1 deletion main.py
@@ -22,6 +22,7 @@
parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
parser.add_argument("--wait-for-peers", type=int, default=0, help="Number of peers to wait to connect to before starting")
parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
parser.add_argument("--chatgpt-api-response-timeout-secs", type=int, default=90, help="ChatGPT API response timeout in seconds")
parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use")
args = parser.parse_args()

@@ -57,7 +58,7 @@
node = StandardNode(args.node_id, None, inference_engine, discovery, partitioning_strategy=RingMemoryWeightedPartitioningStrategy(), chatgpt_api_endpoint=f"http://localhost:{args.chatgpt_api_port}/v1/chat/completions", web_chat_url=f"http://localhost:{args.chatgpt_api_port}")
server = GRPCServer(node, args.node_host, args.node_port)
node.server = server
- api = ChatGPTAPI(node, inference_engine.__class__.__name__)
+ api = ChatGPTAPI(node, inference_engine.__class__.__name__, response_timeout_secs=args.chatgpt_api_response_timeout_secs)

node.on_token.register("main_log").on_next(lambda _, tokens , __: print(inference_engine.tokenizer.decode(tokens) if hasattr(inference_engine, "tokenizer") else tokens))
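Taken together, the flags above are enough to bring up the two-node setup the integration tests exercise; a local sketch using the same ports as the CI jobs:

```sh
# Two local instances with swapped listen/broadcast ports so they discover
# each other, each exposing its own ChatGPT-compatible API.
python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 &
python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 &
```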

