[Bounty] PyTorch & HuggingFace Interface #139
base: main
Changes from 250 commits
@@ -0,0 +1,24 @@
# Helper functions for pytorch inference
# Some code coming from tinygrad but written towards pytorch

import asyncio
import aiohttp
from tqdm import tqdm
from pathlib import Path
from typing import List

async def fetch_file_async(session, url: str, output_path: Path):
    async with session.get(url) as response:
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            async for chunk in response.content.iter_chunked(8192):
                f.write(chunk)

async def download_files(urls: List[str], output_paths: List[Path]):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url, output_path in zip(urls, output_paths):
            tasks.append(fetch_file_async(session, url, output_path))

        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Downloading files"):
            await f
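For reference, a minimal usage sketch (not part of the diff) showing how download_files could be driven from synchronous code; the URL and output path are placeholders, not real exo endpoints, and download_files is assumed to come from the file above.

import asyncio
from pathlib import Path

# Placeholder inputs; real callers would pass the model's actual file URLs.
urls = ["https://example.com/model-00001-of-00002.safetensors"]
paths = [Path("/tmp/model-00001-of-00002.safetensors")]

asyncio.run(download_files(urls, paths))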
@@ -0,0 +1,137 @@
# experimental, based off of tinygrad/inference.py

import numpy as np
import torch
import numpy as np
import json
from typing import Optional, Callable, Tuple
from exo.inference.shard import Shard
from exo.inference.inference_engine import InferenceEngine
from exo.inference.pytorch.model.hf import ShardedHuggingFaceModel
from exo.api.chatgpt_api import resolve_tokenizer
from exo.helpers import DEBUG
from transformers import DynamicCache

class PyTorchDynamicShardInferenceEngine(InferenceEngine):
    """
    PyTorch Dynamic Shard Inference Engine for performing model inference with sharded models.
    """

    def __init__(self):
        """
        Initialize the inference engine.

        Args:
            debug (bool): If True, enables debug logging. Defaults to False.
        """
        self.shard = None
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Review thread on the device selection line:

Reviewer: Are these the only options? I think supporting e.g. Mac with MPS would be great, since then you can run heterogeneous clusters. One thing to try at some point would be mixing MLX and PyTorch and seeing whether they are interoperable with exactly the same model.

Author: With PyTorch I don't think Mac support is fully rolled out yet. There seem to be some workarounds, but CUDA and CPU are the only options on the PyTorch download site, and PyTorch even stopped ROCm support for AMD. They have a nightly for testing MPS: https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/

Reviewer: What about this in the official "stable" docs: https://pytorch.org/docs/stable/notes/mps.html

Author: Will try that, but I currently have no Mac to test on. Once I get through these other fixes I can add it for you or other Mac users to test.
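Following up on that thread, a minimal sketch (not part of the PR, and untested on a Mac) of how device selection could also probe for MPS; the helper name is hypothetical.

import torch

def pick_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    # MPS is exposed on Apple Silicon builds of PyTorch; guard the attribute
    # so older builds without torch.backends.mps don't raise.
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")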
    async def infer_prompt(
        self,
        request_id: str,
        shard: Optional[Shard] = None,
        prompt: str = "",
        image_str: Optional[str] = None,
        inference_state: Optional[str] = None
    ) -> Tuple[np.ndarray, str, bool]:
        if DEBUG >= 2:
            print("infer_prompt called")

        await self.ensure_shard(shard)

        # need to make this so inference_state is not a string
        # cant use it with dynamic cache

        tokens = self.tokenizer.encode(prompt, return_tensors="pt")

        if DEBUG >= 2:
            print(f"tokens: {tokens}\n")

        output_data = self.model.forward_layers(
            tokens
        )

        is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

        if is_finished:
            print(f"token from llm decode: {self.tokenizer.decode(output_data)}")

        if DEBUG >= 2:
            print(f"output_data: {output_data}\n")
            print(f"output_data.size {output_data.size}\n")
            print(f"output_data.item() {output_data.item()}")
            print(f"finished: {is_finished}")
            print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
            print(f"output_data[-1] {output_data[-1]}")
            print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")

        return (
            output_data,
            "",
            is_finished
        )
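Regarding the TODO above about inference_state being a string and therefore unusable with DynamicCache: one possible workaround, sketched here only and not part of this PR, is to round-trip the cache through its legacy (key, value) tuples and base64 so it fits the string-typed field.

import base64
import io

import torch
from transformers import DynamicCache

def cache_to_str(cache: DynamicCache) -> str:
    # Serialise the legacy (key, value) tuple form of the cache to a string.
    buf = io.BytesIO()
    torch.save(cache.to_legacy_cache(), buf)
    return base64.b64encode(buf.getvalue()).decode("utf-8")

def cache_from_str(state: str) -> DynamicCache:
    buf = io.BytesIO(base64.b64decode(state))
    return DynamicCache.from_legacy_cache(torch.load(buf))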
    async def infer_tensor(
        self,
        request_id: str,
        shard: Shard,
        input_data: np.ndarray,
        inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:

        in_tensor = torch.tensor(input_data)

        # Ensure input_data is 2D: [batch_size, seq_len]
        if in_tensor.dim() == 1:
            in_tensor = in_tensor.unsqueeze(0)  # Add a batch dimension: [1, seq_len]

        if DEBUG >= 2:
            print("infer_tensor called")
            print(f"input_data: {input_data}\n")
            print(f"in_tensor: {in_tensor}\n")

        await self.ensure_shard(shard)

        output_data = self.model.forward_layers(
            in_tensor
        )

        is_finished = output_data.size == 1 and output_data.item() in [self.tokenizer.eos_token_id]

        if DEBUG >= 2:
            print(f"output_data: {output_data}\n")
            print(f"output_data.size {output_data.size}\n")
            print(f"output_data.item() {output_data.item()}")
            print(f"finished: {is_finished}")
            print(f"self.tokenizer.eos_token_id {self.tokenizer.eos_token_id}")
            print(f"output_data[-1] {output_data[-1]}")
            print(f"output_data.item() in [self.tokenizer.eos_token_id]: {output_data.item() in [self.tokenizer.eos_token_id]}")

        return (
            output_data,
            "",
            is_finished
        )
    async def ensure_shard(self, shard: Optional[Shard]):
        """
        Ensure the model shard is loaded and ready for inference.

        Args:
            shard (Optional[Shard]): Shard information for the model.
        """
        if self.shard == shard:
            return

        if DEBUG >= 2:
            print(f"Loading new shard: {shard}")

        self.model = ShardedHuggingFaceModel(shard)
        self.tokenizer = await resolve_tokenizer(shard.model_id)
        self.shard = shard

        if DEBUG >= 2:
            print(f"Shard loaded successfully: {shard}")
@@ -0,0 +1,107 @@
import torch
import torch.nn as nn
import numpy as np

Reviewer: I don't think this is used.

from transformers import AutoModelForCausalLM, LlamaConfig, DynamicCache, Cache
from exo.inference.shard import Shard
from exo.helpers import DEBUG
from typing import Tuple

from .utils import sample_logits

class ShardedHuggingFaceModel(torch.nn.Module):
    def __init__(self, shard: Shard):
        super(ShardedHuggingFaceModel, self).__init__()

        if DEBUG >= 2:
            print(f"\nShardedHuggingFaceModel init with shard {shard}")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.shard = shard

        # Load the model
        self.full_model = AutoModelForCausalLM.from_pretrained(
            shard.model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto"
        )

Review thread on the from_pretrained call:

Reviewer: Will this download the entire model?

Reviewer: Also, this won't work with our download-progress code. We show the download progress of the model in the TUI.

Author: Will look at using that code, because yes, it currently does download the whole model.

Reviewer (on the torch_dtype line): Again, are these the only options? Would want support across other platforms.
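One possible direction for the download question above, sketched here without any claim about exo's actual progress API: pre-fetch the repository with huggingface_hub and point from_pretrained at the local snapshot, so the fetch step can later be swapped for exo's own progress-aware downloader.

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM

def load_model_locally(model_id: str):
    # Download (or reuse the cached) snapshot first; allow_patterns could
    # later restrict which weight files a partial shard actually needs.
    local_path = snapshot_download(repo_id=model_id)
    return AutoModelForCausalLM.from_pretrained(
        local_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )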
        # using LlamaConfig not working; setting layers manually
        layers = []
        for i in range(shard.start_layer, shard.end_layer + 1):
            layer = self.full_model.model.layers[i]

            if DEBUG >= 2:
                print(f"Loading layers[{i}]")

            layers.append(layer)

        self.full_model.model.layers = nn.ModuleList(layers)

Review thread on the layer list:

Reviewer: What does the peak memory usage look like here? I'm not sure of the specifics of Python, whether this is going to hold each layer twice. Not sure, but perhaps setting them in place would be more memory efficient.

Author: They shouldn't be held twice: when the ensure_shard function is called in infer_prompt or infer_tensor, the init function is called, which loads the needed layers each time depending on the shard. Will make sure about memory limits and usage, though.
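On the "setting them in place" suggestion, a sketch (not part of the PR) that trims the layers in place instead of building a second Python list, so the dropped layers lose their last model reference as soon as the ModuleList is replaced.

import gc
import torch.nn as nn

def keep_layer_range(full_model, start: int, end: int) -> None:
    # Slicing an nn.ModuleList returns a new ModuleList over the same modules.
    kept = full_model.model.layers[start:end + 1]
    full_model.model.layers = nn.ModuleList(kept)
    # Layers outside [start, end] are no longer referenced by the model.
    gc.collect()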
        if DEBUG >= 2:
            print(f"full_model.model layer: {len(self.full_model.model.layers)}")

        # Embeddings and final layer norm
        # used for doing what forward LlamaModel does in transformers
        self.embed_tokens = self.full_model.model.embed_tokens
        self.norm = self.full_model.model.norm

        # self.past_key_values = DynamicCache()

Review thread on forward_layers:

Reviewer: I really like this approach of generalising this so it works for other models without having to explicitly implement them. Can you write a test for a model with a different architecture to make sure this generalises, e.g. recurrent Gemma?

(A sketch of such a test follows the end of this file.)

    def forward_layers(
        self,
        input_data: torch.tensor
    ) -> Tuple[np.ndarray, list]:
""" | ||
Forward pass through the specified layers. | ||
|
||
Note: past_key_values not working for model, might be a library bug | ||
""" | ||
if DEBUG >= 2: | ||
print("forward_layer call") | ||
print(f"input_data: {input_data}") | ||
print(f"shard {self.shard.to_dict()}") | ||
|
||
hidden_states = input_data | ||
|
||
# Forward pass through the layer | ||
if DEBUG >= 2: | ||
print(f"\n[layer model] {self.full_model.model}") | ||
print(f"IN hidden_states {hidden_states}") | ||
# print(f"past_kvs {past_kvs}") | ||
|
||
self.full_model.model.layer_idx = 5 | ||
layer_outputs = self.full_model.model( | ||
hidden_states, | ||
# position_ids=position_ids, | ||
# inputs_embeds=position_embeddings, | ||
# past_key_values=self.past_key_values, | ||
use_cache=False # not enough vram for using cache ;_; | ||
) | ||
|
||
if DEBUG >= 2: | ||
print(f"OUT hidden_states {hidden_states}") | ||
# print(f"\nlayer_outputs: {layer_outputs}") | ||
|
||
hidden_states = layer_outputs.last_hidden_state | ||
# self.past_key_values = layer_outputs.past_key_values | ||
|
||
print(f"2 is_last_layer {self.shard.is_last_layer()}") | ||
if self.shard.is_last_layer(): | ||
hs_norm = self.norm(hidden_states) | ||
hs_lm_head = self.full_model.lm_head(hs_norm).float() | ||
|
||
# Use the sampling function with default settings | ||
output_token = sample_logits( | ||
hs_lm_head[:, -1, :]).cpu().numpy().flatten() | ||
|
||
if DEBUG >= 2: | ||
print(f"hs_norm: {hs_norm}") | ||
print(f"hs_lm_head: {hs_lm_head}") | ||
print(f"output_token: {output_token}") | ||
|
||
return output_token | ||
|
||
return hidden_states.cpu().numpy() |
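As a follow-up to the reviewer's request above for a test on a second architecture, a standalone sketch (not part of the diff) that exercises the same "slice the decoder layers generically" idea against GPT-2. Note the layer attribute differs per architecture (transformer.h here versus model.layers for Llama), which is exactly what such a test would catch; the tiny model id is an assumption chosen for speed.

import unittest

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer


class TestLayerSlicingGeneralises(unittest.TestCase):
    def test_partial_layers_forward(self):
        model_id = "sshleifer/tiny-gpt2"  # hypothetical choice of tiny model
        model = AutoModelForCausalLM.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # GPT-2 keeps its decoder blocks on transformer.h, not model.layers.
        blocks = model.transformer.h
        model.transformer.h = nn.ModuleList(list(blocks)[: max(1, len(blocks) // 2)])

        input_ids = tokenizer("hello world", return_tensors="pt").input_ids
        with torch.no_grad():
            outputs = model(input_ids)

        # A forward pass through the truncated stack should still produce logits.
        self.assertEqual(outputs.logits.shape[0], 1)


if __name__ == "__main__":
    unittest.main()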
@@ -0,0 +1,63 @@
import torch
from torch.nn import functional as F

Review thread on sample_logits:

Reviewer: Can this be imported from somewhere rather than copy-pasta into the codebase? It looks like boilerplate code from somewhere.

Author: Yes, I was testing it with the default values but will clean that part up. I will set it in the Interface class settings to be used.

def sample_logits(logits, temperature=1.0, top_k=0, top_p=1.0, alpha_f=0.0, alpha_p=0.0):
""" | ||
Sample tokens from logits using temperature, top-k, and top-p (nucleus) sampling. | ||
|
||
Args: | ||
logits (torch.Tensor): The logits distribution to sample from. | ||
temperature (float): Temperature for scaling logits. | ||
top_k (int): The number of top tokens to consider for sampling. | ||
top_p (float): The cumulative probability threshold for nucleus sampling. | ||
alpha_f (float): Penalty factor for repetition frequency. | ||
alpha_p (float): Penalty for repeated selection. | ||
|
||
Returns: | ||
torch.Tensor: The selected token index. | ||
""" | ||
|
||
# Ensure logits are float | ||
logits = logits.float() | ||
|
||
# If temperature is very low, just use argmax | ||
if temperature < 1e-6: | ||
return logits.argmax(dim=-1) | ||
|
||
# Alpha sampling (adjusting logits based on past selections) | ||
if alpha_f > 0.0 or alpha_p > 0.0: | ||
logits -= (sample_logits.alpha_counter * alpha_f + (sample_logits.alpha_counter > 0) * alpha_p) | ||
|
||
# Replace NaNs with -inf to prevent softmax issues | ||
logits = torch.where(torch.isnan(logits), torch.full_like(logits, -float('inf')), logits) | ||
|
||
# Apply temperature scaling | ||
logits = logits / temperature | ||
|
||
# Top-k sampling | ||
if top_k > 0: | ||
top_k = min(top_k, logits.size(-1)) | ||
top_k_values, top_k_indices = torch.topk(logits, top_k, dim=-1) | ||
logits = torch.full_like(logits, -float('inf')) | ||
logits.scatter_(-1, top_k_indices, top_k_values) | ||
|
||
# Top-p sampling | ||
if 0 < top_p < 1.0: | ||
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) | ||
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) | ||
|
||
# Remove tokens with cumulative probability above the threshold | ||
sorted_indices_to_remove = cumulative_probs > top_p | ||
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() | ||
sorted_indices_to_remove[..., 0] = 0 | ||
|
||
sorted_logits[sorted_indices_to_remove] = -float('inf') | ||
logits = sorted_logits | ||
|
||
# Apply softmax to get probabilities | ||
probabilities = F.softmax(logits, dim=-1) | ||
|
||
# Sample from the probabilities | ||
sampled_token = torch.multinomial(probabilities, 1) | ||
|
||
return sampled_token.squeeze() |
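For quick verification, a standalone sketch (not in the diff) that exercises sample_logits with random logits, mirroring how hf.py calls it on the last position of the lm_head output; the vocabulary size is arbitrary.

import torch

# [batch, vocab]-shaped fake logits, as produced by hs_lm_head[:, -1, :].
logits = torch.randn(1, 32000)
token = sample_logits(logits, temperature=0.7, top_k=50, top_p=0.9)
print(token.item())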
@@ -0,0 +1,21 @@
import unittest
from unittest.mock import patch, MagicMock
from pathlib import Path
import torch
from exo.inference.shard import Shard
from exo.inference.pytorch.helpers import build_transformer

class TestBuildTransformer(unittest.TestCase):

    def test_build_transformer(self):
        # Call the build_transformer function
        model = build_transformer(
            "gpt2",
            quantize=True,
            device="cuda"
        )

        self.assertIsNotNone(model)

if __name__ == '__main__':
    unittest.main()

Reviewer: Run this test in CircleCI.
Review thread:

Reviewer: This used?

Author: No, I can remove.

Reviewer: Please remove.

Author: Sorry, forgot to. Will do that now.