Merge remote-tracking branch 'upstream/main'
keeping up to date with original
itsknk committed Jul 20, 2024
2 parents 0222b2e + e49924e commit f554ae0
Showing 5 changed files with 138 additions and 15 deletions.
127 changes: 126 additions & 1 deletion .github/workflows/test.yml
@@ -7,7 +7,7 @@ on:
branches: [ main ]

jobs:
- test:
+ unit_test:
runs-on: macos-14
steps:
- uses: actions/checkout@v2
@@ -21,3 +21,128 @@ jobs:
pip install .
- name: Run tests
run: python3 -m exo.inference.test_inference_engine
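The unit test job can be reproduced locally with the same commands the job runs (a sketch, assuming a working Python 3 environment on macOS):

```sh
# Local reproduction of the unit_test job above.
python3 -m pip install --upgrade pip
pip install .
python3 -m exo.inference.test_inference_engine
```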

discovery_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run discovery integration test
run: |
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Stop both instances
kill $PID1 $PID2
# Check outputs
if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
echo "Test passed: Both instances discovered each other"
exit 0
else
echo "Test failed: Devices did not discover each other"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
fi
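The fixed `sleep 10` assumes discovery always finishes within ten seconds. A polling variant (a sketch, not part of this commit) would bound the wait at 30 seconds while proceeding sooner when discovery succeeds early:

```sh
# Sketch: poll both logs instead of sleeping for a fixed interval.
for _ in $(seq 1 30); do
  if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
    break  # both instances have discovered each other
  fi
  sleep 1
done
```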
chatgpt_api_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run chatgpt api integration test
run: |
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 1200 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout-secs 1200 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Send a placeholder request to each node to trigger model loading
curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Placeholder to load model..."}],
"temperature": 0.7
}'
curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Placeholder to load model..."}],
"temperature": 0.7
}'
response_1=$(curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 1: $response_1"
response_2=$(curl -s http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 2: $response_2"
if ! echo "$response_1" | grep -q "Michael Jackson" || ! echo "$response_2" | grep -q "Michael Jackson"; then
echo "Test failed: Response does not contain 'Michael Jackson'"
echo "Response 1: $response_1"
echo "Response 2: $response_2"
exit 1
else
echo "Test passed: Response from both nodes contains 'Michael Jackson'"
fi
# Stop both instances
kill $PID1 $PID2
# Check outputs
if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
echo "Test passed: Both instances discovered each other"
exit 0
else
echo "Test failed: Devices did not discover each other"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
fi
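Grepping the raw JSON also matches occurrences outside the assistant message. Assuming the endpoint returns standard ChatGPT-style completion JSON and that jq is available on the runner, a stricter check (a sketch, not part of this commit) could target the message content directly:

```sh
# Sketch: assert on the assistant message text alone.
content_1=$(echo "$response_1" | jq -r '.choices[0].message.content')
echo "$content_1" | grep -q "Michael Jackson" || exit 1
```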
8 changes: 2 additions & 6 deletions README.md
@@ -104,20 +104,16 @@ The native way to access models running on exo is using the exo library with pee

exo also starts a ChatGPT-compatible API endpoint at http://localhost:8000. Note: this is currently only supported by tail nodes (i.e., nodes selected to sit at the end of the ring topology). Example request:

- ```
+ ```sh
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-70b",
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "What is the meaning of exo?"}],
"temperature": 0.7
}'
```

- ```sh
- curl -X POST http://localhost:8001/api/v1/chat -H "Content-Type: application/json" -d '{"messages": [{"role": "user", "content": "What is the meaning of life?"}]}'
- ```

## Debugging

Enable debug logs with the DEBUG environment variable (0-9).
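For example, the CI jobs in this commit run nodes with both variables at maximum verbosity (DEBUG_DISCOVERY is assumed here to control discovery-specific logging):

```sh
# Maximum verbosity for general and discovery logging, as in the CI jobs above.
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py
```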
4 changes: 2 additions & 2 deletions exo/api/chatgpt_api.py
@@ -117,10 +117,10 @@ def build_prompt(tokenizer, messages: List[Message]):


class ChatGPTAPI:
- def __init__(self, node: Node, inference_engine_classname: str):
+ def __init__(self, node: Node, inference_engine_classname: str, response_timeout_secs: int = 90):
self.node = node
self.inference_engine_classname = inference_engine_classname
- self.response_timeout_secs = 90
+ self.response_timeout_secs = response_timeout_secs
self.app = web.Application()
self.prev_token_lens: Dict[str, int] = {}
self.stream_tasks: Dict[str, asyncio.Task] = {}
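The timeout is no longer hard-coded at 90 seconds; it is wired through from the --chatgpt-api-response-timeout-secs flag added to main.py below. A usage sketch, with the 1200-second value the integration test passes for long model loads:

```sh
# Raise the ChatGPT API response timeout from the default 90 s to 20 minutes.
python3 main.py --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 1200
```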
11 changes: 6 additions & 5 deletions exo/inference/test_inference_engine.py
@@ -30,8 +30,9 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
"mlx-community/Meta-Llama-3-8B-Instruct-4bit",
))

- asyncio.run(test_inference_engine(
-     TinygradDynamicShardInferenceEngine(),
-     TinygradDynamicShardInferenceEngine(),
-     "llama3-8b-sfr",
- ))
+ # TODO: Waiting on https://github.com/tinygrad/tinygrad/issues/5549
+ # asyncio.run(test_inference_engine(
+ #     TinygradDynamicShardInferenceEngine(),
+ #     TinygradDynamicShardInferenceEngine(),
+ #     "llama3-8b-sfr",
+ # ))
3 changes: 2 additions & 1 deletion main.py
@@ -22,6 +22,7 @@
parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
parser.add_argument("--wait-for-peers", type=int, default=0, help="Number of peers to wait to connect to before starting")
parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
parser.add_argument("--chatgpt-api-response-timeout-secs", type=int, default=90, help="ChatGPT API response timeout in seconds")
parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use")
args = parser.parse_args()

@@ -57,7 +58,7 @@
node = StandardNode(args.node_id, None, inference_engine, discovery, partitioning_strategy=RingMemoryWeightedPartitioningStrategy(), chatgpt_api_endpoint=f"http://localhost:{args.chatgpt_api_port}/v1/chat/completions", web_chat_url=f"http://localhost:{args.chatgpt_api_port}")
server = GRPCServer(node, args.node_host, args.node_port)
node.server = server
- api = ChatGPTAPI(node, inference_engine.__class__.__name__)
+ api = ChatGPTAPI(node, inference_engine.__class__.__name__, response_timeout_secs=args.chatgpt_api_response_timeout_secs)

node.on_token.register("main_log").on_next(lambda _, tokens , __: print(inference_engine.tokenizer.decode(tokens) if hasattr(inference_engine, "tokenizer") else tokens))
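Taken together, the flags above are enough to bring up the two-node setup the integration tests exercise; a local sketch using the same ports as the CI jobs:

```sh
# Two local instances with swapped listen/broadcast ports so they discover
# each other, each exposing its own ChatGPT-compatible API.
python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 &
python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 &
```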

