
Commit 5196b34

[llm_perf] optimize llm_perf code structure, fix some known issues, and replace torchrun with mp_engine to support custom vendor implementations. (bytedance#79)
1 parent c1d1835 commit 5196b34

38 files changed: +6641, -3016 lines

.gitignore (+5, -1)
@@ -25,4 +25,8 @@ init_env.sh
 
 byte_infer_perf/llm_perf/download
 byte_infer_perf/llm_perf/model_zoo/sota
-byte_infer_perf/llm_perf/reports
+byte_infer_perf/llm_perf/reports
+
+
+workspace
+test

byte_infer_perf/llm_perf/README.md (+5, -5)
@@ -8,7 +8,7 @@ Vendors can refer to this document for guidance on building backend: [Byte LLM P
 
 ## Installation
 ```shell
-pip3 install torch==2.0.1
+pip3 install torch==2.1.0
 pip3 install -r requirements.txt
 ```
 
@@ -22,11 +22,11 @@ To start llm_perf, there are 3 steps:
 
 You can run following command automate all steps with chatglm2 model on GPU backend
 ```shell
-python3 byte_infer_perf/llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type GPU
+python3 byte_infer_perf/llm_perf/launch.py --hardware_type GPU --task chatglm2-torch-fp16-6b
 ```
 
 ## Models
 The list of supported models is:
-* ChatGLM
-* ChatGLM2
-* Chinese-LLaMA-2
+* [chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
+* [chinese-llama-2-13b](https://huggingface.co/hfl/chinese-llama-2-13b)
+* [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)

byte_infer_perf/llm_perf/backends/GPU/gpu_engine.py (-154)
This file was deleted.
New file:
@@ -0,0 +1,71 @@
import os
from typing import Dict, List, Any
from dataclasses import dataclass

from llm_perf.core.generation import GenerateRequest
from llm_perf.core.inferencer import CoreInferencer
from llm_perf.backends.GPU.gpu_mp_engine import GpuMpEngine
from llm_perf.utils.logger import logger

class GpuInferencer(CoreInferencer):
    def __init__(self, model_impl, xpu_cfg):
        super().__init__()

        self.tp_size = xpu_cfg["tp_size"]
        self.pad_token_id = xpu_cfg["pad_token_id"]
        self.mp_engine = GpuMpEngine(self.tp_size, model_impl, xpu_cfg)

    def prepare_inputs(self, tasks: List["CoreInferencer.Task"]):
        all_input_ids = []
        all_position_ids = []
        all_attention_mask = []

        max_seq_len = -1
        for task in tasks:
            cur_id_len = len(task.request.input_ids) + len(task.generate_ids)
            max_seq_len = cur_id_len if cur_id_len > max_seq_len else max_seq_len

        for task in tasks:
            cur_id_len = len(task.request.input_ids) + len(task.generate_ids)
            pad_len = max_seq_len - cur_id_len
            # using left padding
            input_ids = (
                [self.pad_token_id] * pad_len +
                task.request.input_ids +
                task.generate_ids
            )
            pos_ids = (
                [i for i in range(max_seq_len)]
            )
            attention_mask = (
                [0] * pad_len +
                [1] * cur_id_len
            )
            all_input_ids.append(input_ids)
            all_position_ids.append(pos_ids)
            all_attention_mask.append(attention_mask)

        # create model_inputs
        model_inputs = {}
        model_inputs["input_ids"] = all_input_ids
        model_inputs["position_ids"] = all_position_ids
        model_inputs["attention_mask"] = all_attention_mask

        return model_inputs


    def infer(self, tasks: List["CoreInferencer.Task"]):
        input_dict = self.prepare_inputs(tasks)
        outputs = self.mp_engine.mp_forward(input_dict)

        input_logits = outputs.logits[..., :-1, :].contiguous()
        next_tokens_logits = outputs.logits[:, -1, :].contiguous()
        logger.debug(
            f"tensor shape: {outputs.logits.shape}\n"
            f"next tokens logits: {next_tokens_logits.shape}\n"
            f"input logits: {input_logits.shape}\n"
        )
        return {
            "input_logits": input_logits,
            "last_logits": next_tokens_logits,
        }
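For reference, prepare_inputs above batches requests by left padding: shorter sequences are prefixed with pad tokens up to the longest sequence in the batch, and the attention mask zeroes out those padded positions. The standalone sketch below reproduces that batching logic with made-up token ids and a hypothetical left_pad_batch helper (it does not import llm_perf), just to show the layout that gets handed to the mp engine.

# Standalone illustration of the left-padding scheme used in prepare_inputs.
# left_pad_batch and the token ids are hypothetical, for illustration only.
from typing import Dict, List

def left_pad_batch(sequences: List[List[int]], pad_token_id: int = 0) -> Dict[str, List[List[int]]]:
    max_seq_len = max(len(seq) for seq in sequences)
    batch = {"input_ids": [], "position_ids": [], "attention_mask": []}
    for seq in sequences:
        pad_len = max_seq_len - len(seq)
        batch["input_ids"].append([pad_token_id] * pad_len + seq)      # pad on the left
        batch["position_ids"].append(list(range(max_seq_len)))         # 0 .. max_seq_len-1
        batch["attention_mask"].append([0] * pad_len + [1] * len(seq)) # 0 over padding
    return batch

if __name__ == "__main__":
    # two requests of different lengths; token ids are made up
    batch = left_pad_batch([[11, 12, 13, 14], [21, 22]], pad_token_id=0)
    print(batch["input_ids"])       # [[11, 12, 13, 14], [0, 0, 21, 22]]
    print(batch["attention_mask"])  # [[1, 1, 1, 1], [0, 0, 1, 1]]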
New file:
@@ -0,0 +1,73 @@
import os
from multiprocessing import Queue

import torch
import torch.nn as nn

from llm_perf.core.mp_engine import CoreMpEngine
from llm_perf.utils.logger import logger

class GpuMpEngine(CoreMpEngine):
    def __init__(self, world_size: int, model_impl: nn.Module, xpu_cfg) -> None:
        super().__init__(world_size, model_impl, xpu_cfg)


    def build_inputs(self, forward_inputs):
        forward_inputs["input_ids"] = torch.tensor(forward_inputs["input_ids"]).cuda()
        forward_inputs["position_ids"] = torch.tensor(forward_inputs["position_ids"]).cuda()
        forward_inputs["attention_mask"] = torch.tensor(forward_inputs["attention_mask"]).cuda()
        return forward_inputs

    @torch.no_grad()
    def mp_loop_worker(
        self,
        local_rank: int,
        world_size: int,
        input_queue: Queue,
        output_queue: Queue,
        model_impl,
        xpu_config
    ):
        try:
            torch.manual_seed(1)

            # set rank and world_size
            os.environ["RANK"] = str(local_rank)
            os.environ["LOCAL_RANK"] = str(local_rank)
            os.environ["WORLD_SIZE"] = str(world_size)
            os.environ["LOCAL_WORLD_SIZE"] = str(world_size)

            # set device
            torch.cuda.set_device(local_rank)

            # create and init model based on model_impl and xpu_config
            model = model_impl(xpu_config)

            # current rank is ready
            output_queue.put("ready")
            logger.info(f"{local_rank}/{world_size} rank is ready")

            # model process loop
            while True:
                (
                    forward_inputs,
                ) = input_queue.get(block=True)

                # model forward
                inputs = self.build_inputs(forward_inputs)
                logits = model.forward(inputs)

                if local_rank == 0:
                    output_queue.put(logits)
                torch.cuda.synchronize()

        except Exception as e:
            logger.exception(f"[BUG] engine _load_and_listen failed, no more requests will be handled. {e}")
            output_queue.put(RuntimeError("[BUG] fatal exception in model subprocess"))


    def mp_forward(self, *args):
        for i in range(self.world_size):
            self._input_queues.put(args, True)
        return self._output_queues.get(True)
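The worker loop above depends on the CoreMpEngine base class (llm_perf/core/mp_engine.py, not part of this diff) to spawn one process per tensor-parallel rank and to create the queues that mp_forward uses. As a rough, hypothetical sketch of that wiring, the toy engine below uses plain Python multiprocessing with one input queue per rank and a dummy function in place of a CUDA model; the names ToyMpEngine and worker are illustrative only. It shows the same pattern as mp_loop_worker/mp_forward: broadcast identical inputs to every rank, then collect the result from rank 0.

# Minimal, hypothetical sketch of the process/queue wiring mp_forward relies on.
# The real CoreMpEngine is not shown in this diff; details here are assumptions.
import multiprocessing as mp

def worker(local_rank, world_size, input_queue, output_queue):
    output_queue.put("ready")                          # signal this rank is up
    while True:
        forward_inputs = input_queue.get(block=True)
        if forward_inputs is None:                     # shutdown sentinel
            break
        result = sum(forward_inputs["input_ids"][0])   # stand-in for model.forward(...)
        if local_rank == 0:                            # only rank 0 reports the result
            output_queue.put(result)

class ToyMpEngine:
    def __init__(self, world_size: int):
        self.world_size = world_size
        self._input_queues = [mp.Queue() for _ in range(world_size)]
        self._output_queue = mp.Queue()
        self._procs = [
            mp.Process(target=worker,
                       args=(rank, world_size, self._input_queues[rank], self._output_queue),
                       daemon=True)
            for rank in range(world_size)
        ]
        for p in self._procs:
            p.start()
        for _ in range(world_size):                    # wait until every rank reports "ready"
            assert self._output_queue.get() == "ready"

    def mp_forward(self, forward_inputs):
        for q in self._input_queues:                   # broadcast the same inputs to every rank
            q.put(forward_inputs, True)
        return self._output_queue.get(True)            # collect the output from rank 0

    def shutdown(self):
        for q in self._input_queues:
            q.put(None)

if __name__ == "__main__":
    engine = ToyMpEngine(world_size=2)
    print(engine.mp_forward({"input_ids": [[1, 2, 3]]}))   # prints 6
    engine.shutdown()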

byte_infer_perf/llm_perf/backends/GPU/gpu_process_messager.py (-49)
This file was deleted.
