
Commit 5196b34

[llm_perf] optimize llm_perf code structure, fix some known issues, and replace torchrun with mp_engine to support custom vendor implementations. (bytedance#79)
1 parent c1d1835 commit 5196b34

38 files changed: +6641, -3016 lines

.gitignore (+5, -1)
@@ -25,4 +25,8 @@ init_env.sh
 
 byte_infer_perf/llm_perf/download
 byte_infer_perf/llm_perf/model_zoo/sota
-byte_infer_perf/llm_perf/reports
+byte_infer_perf/llm_perf/reports
+
+
+workspace
+test

byte_infer_perf/llm_perf/README.md (+5, -5)
@@ -8,7 +8,7 @@ Vendors can refer to this document for guidance on building backend: [Byte LLM P
 
 ## Installation
 ```shell
-pip3 install torch==2.0.1
+pip3 install torch==2.1.0
 pip3 install -r requirements.txt
 ```
 
@@ -22,11 +22,11 @@ To start llm_perf, there are 3 steps:
 
 You can run following command automate all steps with chatglm2 model on GPU backend
 ```shell
-python3 byte_infer_perf/llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type GPU
+python3 byte_infer_perf/llm_perf/launch.py --hardware_type GPU --task chatglm2-torch-fp16-6b
 ```
 
 ## Models
 The list of supported models is:
-* ChatGLM
-* ChatGLM2
-* Chinese-LLaMA-2
+* [chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
+* [chinese-llama-2-13b](https://huggingface.co/hfl/chinese-llama-2-13b)
+* [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)

byte_infer_perf/llm_perf/backends/GPU/gpu_engine.py (-154)
This file was deleted.
New file:
@@ -0,0 +1,71 @@
import os
from typing import Dict, List, Any
from dataclasses import dataclass

from llm_perf.core.generation import GenerateRequest
from llm_perf.core.inferencer import CoreInferencer
from llm_perf.backends.GPU.gpu_mp_engine import GpuMpEngine
from llm_perf.utils.logger import logger

class GpuInferencer(CoreInferencer):
    def __init__(self, model_impl, xpu_cfg):
        super().__init__()

        self.tp_size = xpu_cfg["tp_size"]
        self.pad_token_id = xpu_cfg["pad_token_id"]
        self.mp_engine = GpuMpEngine(self.tp_size, model_impl, xpu_cfg)

    def prepare_inputs(self, tasks: List["CoreInferencer.Task"]):
        all_input_ids = []
        all_position_ids = []
        all_attention_mask = []

        max_seq_len = -1
        for task in tasks:
            cur_id_len = len(task.request.input_ids) + len(task.generate_ids)
            max_seq_len = cur_id_len if cur_id_len > max_seq_len else max_seq_len

        for task in tasks:
            cur_id_len = len(task.request.input_ids) + len(task.generate_ids)
            pad_len = max_seq_len - cur_id_len
            # using left padding
            input_ids = (
                [self.pad_token_id] * pad_len +
                task.request.input_ids +
                task.generate_ids
            )
            pos_ids = (
                [i for i in range(max_seq_len)]
            )
            attention_mask = (
                [0] * pad_len +
                [1] * cur_id_len
            )
            all_input_ids.append(input_ids)
            all_position_ids.append(pos_ids)
            all_attention_mask.append(attention_mask)

        # create model_inputs
        model_inputs = {}
        model_inputs["input_ids"] = all_input_ids
        model_inputs["position_ids"] = all_position_ids
        model_inputs["attention_mask"] = all_attention_mask

        return model_inputs


    def infer(self, tasks: List["CoreInferencer.Task"]):
        input_dict = self.prepare_inputs(tasks)
        outputs = self.mp_engine.mp_forward(input_dict)

        input_logits = outputs.logits[..., :-1, :].contiguous()
        next_tokens_logits = outputs.logits[:, -1, :].contiguous()
        logger.debug(
            f"tensor shape: {outputs.logits.shape}\n"
            f"next tokens logits: {next_tokens_logits.shape}\n"
            f"input logits: {input_logits.shape}\n"
        )
        return {
            "input_logits": input_logits,
            "last_logits": next_tokens_logits,
        }
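For reference, prepare_inputs above batches requests by left padding: shorter sequences are prefixed with pad tokens up to the longest sequence in the batch, and the attention mask zeroes out those padded positions. The standalone sketch below reproduces that batching logic with made-up token ids and a hypothetical left_pad_batch helper (it does not import llm_perf), just to show the layout that gets handed to the mp engine.

# Standalone illustration of the left-padding scheme used in prepare_inputs.
# left_pad_batch and the token ids are hypothetical, for illustration only.
from typing import Dict, List

def left_pad_batch(sequences: List[List[int]], pad_token_id: int = 0) -> Dict[str, List[List[int]]]:
    max_seq_len = max(len(seq) for seq in sequences)
    batch = {"input_ids": [], "position_ids": [], "attention_mask": []}
    for seq in sequences:
        pad_len = max_seq_len - len(seq)
        batch["input_ids"].append([pad_token_id] * pad_len + seq)      # pad on the left
        batch["position_ids"].append(list(range(max_seq_len)))         # 0 .. max_seq_len-1
        batch["attention_mask"].append([0] * pad_len + [1] * len(seq)) # 0 over padding
    return batch

if __name__ == "__main__":
    # two requests of different lengths; token ids are made up
    batch = left_pad_batch([[11, 12, 13, 14], [21, 22]], pad_token_id=0)
    print(batch["input_ids"])       # [[11, 12, 13, 14], [0, 0, 21, 22]]
    print(batch["attention_mask"])  # [[1, 1, 1, 1], [0, 0, 1, 1]]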
New file:
@@ -0,0 +1,73 @@
import os
from multiprocessing import Queue

import torch
import torch.nn as nn

from llm_perf.core.mp_engine import CoreMpEngine
from llm_perf.utils.logger import logger

class GpuMpEngine(CoreMpEngine):
    def __init__(self, world_size: int, model_impl: nn.Module, xpu_cfg) -> None:
        super().__init__(world_size, model_impl, xpu_cfg)


    def build_inputs(self, forward_inputs):
        forward_inputs["input_ids"] = torch.tensor(forward_inputs["input_ids"]).cuda()
        forward_inputs["position_ids"] = torch.tensor(forward_inputs["position_ids"]).cuda()
        forward_inputs["attention_mask"] = torch.tensor(forward_inputs["attention_mask"]).cuda()
        return forward_inputs

    @torch.no_grad()
    def mp_loop_worker(
        self,
        local_rank: int,
        world_size: int,
        input_queue: Queue,
        output_queue: Queue,
        model_impl,
        xpu_config
    ):
        try:
            torch.manual_seed(1)

            # set rank and world_size
            os.environ["RANK"] = str(local_rank)
            os.environ["LOCAL_RANK"] = str(local_rank)
            os.environ["WORLD_SIZE"] = str(world_size)
            os.environ["LOCAL_WORLD_SIZE"] = str(world_size)

            # set device
            torch.cuda.set_device(local_rank)

            # create and init model based on model_impl and xpu_config
            model = model_impl(xpu_config)

            # current rank is ready
            output_queue.put("ready")
            logger.info(f"{local_rank}/{world_size} rank is ready")

            # model process loop
            while True:
                (
                    forward_inputs,
                ) = input_queue.get(block=True)

                # model forward
                inputs = self.build_inputs(forward_inputs)
                logits = model.forward(inputs)

                if local_rank == 0:
                    output_queue.put(logits)
                torch.cuda.synchronize()

        except Exception as e:
            logger.exception(f"[BUG] engine _load_and_listen failed, no more requests will be handled. {e}")
            output_queue.put(RuntimeError("[BUG] fatal exception in model subprocess"))


    def mp_forward(self, *args):
        for i in range(self.world_size):
            self._input_queues.put(args, True)
        return self._output_queues.get(True)
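The worker loop above depends on the CoreMpEngine base class (llm_perf/core/mp_engine.py, not part of this diff) to spawn one process per tensor-parallel rank and to create the queues that mp_forward uses. As a rough, hypothetical sketch of that wiring, the toy engine below uses plain Python multiprocessing with one input queue per rank and a dummy function in place of a CUDA model; the names ToyMpEngine and worker are illustrative only. It shows the same pattern as mp_loop_worker/mp_forward: broadcast identical inputs to every rank, then collect the result from rank 0.

# Minimal, hypothetical sketch of the process/queue wiring mp_forward relies on.
# The real CoreMpEngine is not shown in this diff; details here are assumptions.
import multiprocessing as mp

def worker(local_rank, world_size, input_queue, output_queue):
    output_queue.put("ready")                          # signal this rank is up
    while True:
        forward_inputs = input_queue.get(block=True)
        if forward_inputs is None:                     # shutdown sentinel
            break
        result = sum(forward_inputs["input_ids"][0])   # stand-in for model.forward(...)
        if local_rank == 0:                            # only rank 0 reports the result
            output_queue.put(result)

class ToyMpEngine:
    def __init__(self, world_size: int):
        self.world_size = world_size
        self._input_queues = [mp.Queue() for _ in range(world_size)]
        self._output_queue = mp.Queue()
        self._procs = [
            mp.Process(target=worker,
                       args=(rank, world_size, self._input_queues[rank], self._output_queue),
                       daemon=True)
            for rank in range(world_size)
        ]
        for p in self._procs:
            p.start()
        for _ in range(world_size):                    # wait until every rank reports "ready"
            assert self._output_queue.get() == "ready"

    def mp_forward(self, forward_inputs):
        for q in self._input_queues:                   # broadcast the same inputs to every rank
            q.put(forward_inputs, True)
        return self._output_queue.get(True)            # collect the output from rank 0

    def shutdown(self):
        for q in self._input_queues:
            q.put(None)

if __name__ == "__main__":
    engine = ToyMpEngine(world_size=2)
    print(engine.mp_forward({"input_ids": [[1, 2, 3]]}))   # prints 6
    engine.shutdown()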

byte_infer_perf/llm_perf/backends/GPU/gpu_process_messager.py (-49)
This file was deleted.
