Update TensorRT-LLM (NVIDIA#1427)
* Update TensorRT-LLM

---------

Co-authored-by: meghagarwal <[email protected]>
kaiyux and megha95 authored Apr 9, 2024
1 parent 118b3d7 commit 035b99e
Showing 216 changed files with 733,895 additions and 2,577 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -1 +1,2 @@
*.a filter=lfs diff=lfs merge=lfs -text
*.lib filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions benchmarks/cpp/README.md
@@ -83,6 +83,7 @@ python3 prepare_dataset.py \
[--time-delay-dist exponential_dist] \
dataset
--dataset-name <name of the dataset> \
--dataset-split <split of the dataset to use> \
--dataset-input-key <dataset dictionary key for input> \
--dataset-prompt-key <dataset dictionary key for prompt> \
--dataset-output-key <dataset dictionary key for output> \
@@ -99,6 +100,7 @@ python3 prepare_dataset.py \
--output cnn_dailymail.json
dataset
--dataset-name cnn_dailymail \
--dataset-split validation \
--dataset-config-name 3.0.0 \
--dataset-input-key article \
--dataset-prompt "Summarize the following article:" \
321 changes: 261 additions & 60 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions benchmarks/cpp/utils/prepare_real_data.py
@@ -125,11 +125,10 @@ def load_dataset_from_hf(dataset_config: DatasetConfig):
type=str,
default=None,
help=f"Dataset config name in HuggingFace (if exists).")
@click.option(
"--dataset-split",
type=str,
default=None,
help=f"Split of the dataset to use. Default will include all splits.")
@click.option("--dataset-split",
type=str,
required=True,
help=f"Split of the dataset to use.")
@click.option("--dataset-input-key",
required=True,
type=str,
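With the change above, --dataset-split is now required rather than defaulting to all splits, so every prepare_dataset.py invocation has to name a split explicitly (the README example above passes validation). Below is a minimal, self-contained sketch of what required=True buys in click; the load command and its body are hypothetical stand-ins for illustration, not code from the repository.

import click
from click.testing import CliRunner


@click.command()
@click.option("--dataset-split",
              type=str,
              required=True,
              help="Split of the dataset to use.")
def load(dataset_split):
    # Hypothetical stand-in for the real prepare_dataset.py dataset command.
    click.echo(f"loading split: {dataset_split}")


runner = CliRunner()
assert runner.invoke(load, ["--dataset-split", "validation"]).exit_code == 0
# Omitting the now-required option makes click abort with a usage error (exit code 2).
assert runner.invoke(load, []).exit_code == 2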
8 changes: 8 additions & 0 deletions benchmarks/python/benchmark.py
@@ -244,6 +244,11 @@ def parse_arguments():
help=
"Check the estimated memory usage against the total GPU memory. Raise error if the estimated memory requirement is bigger than the total GPU memory"
"Warning: only GPT model family is supported for now")
parser.add_argument(
'--dump_profile',
default=False,
action='store_true',
help="Print profile information per layer (default = disabled)")
return parser.parse_args()


@@ -310,6 +315,9 @@ def main(args):
if args.build_only:
return

if args.dump_profile and benchmark_profiler is not None:
benchmark_profiler.set_recording_perf_profile(True)

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
benchmarker.print_report_header(args.csv,
5 changes: 5 additions & 0 deletions benchmarks/python/benchmark_profiler.py
@@ -21,12 +21,14 @@ class BenchmarkProfiler(object):
timer_dict: dict
aux_info: dict
started: bool
is_recording_perf_profile: bool

def __init__(self):
self.cuda_event_dict = {}
self.timer_dict = {}
self.aux_info = {}
self.started = False
self.is_recording_perf_profile = False

def clean(self):
self.cuda_event_dict = {}
@@ -75,3 +77,6 @@ def add_aux_info(self, aux_name: str, add_value):
if not self.started:
return
self.aux_info[aux_name] += add_value

def set_recording_perf_profile(self, value: bool):
self.is_recording_perf_profile = value
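Taken together with the --dump_profile flag added to benchmark.py above, the new attribute is just a switch that the runtime consults before collecting per-layer timings. A minimal sketch of the wiring, using a stripped-down stand-in for BenchmarkProfiler rather than the real class:

import argparse


class BenchmarkProfiler:
    # Stripped-down stand-in; the real class also tracks CUDA events and timers.
    def __init__(self):
        self.is_recording_perf_profile = False

    def set_recording_perf_profile(self, value: bool):
        self.is_recording_perf_profile = value


parser = argparse.ArgumentParser()
parser.add_argument('--dump_profile',
                    default=False,
                    action='store_true',
                    help="Print profile information per layer (default = disabled)")
args = parser.parse_args(['--dump_profile'])

benchmark_profiler = BenchmarkProfiler()
if args.dump_profile and benchmark_profiler is not None:
    benchmark_profiler.set_recording_perf_profile(True)

assert benchmark_profiler.is_recording_perf_profile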
6 changes: 6 additions & 0 deletions benchmarks/python/bert_benchmark.py
@@ -64,6 +64,12 @@ def __init__(self, args, batch_sizes, in_lens, rank, world_size):
self.session = tensorrt_llm.runtime.Session.from_serialized_engine(
engine_buffer)

# Print context memory size for CI/CD to track.
context_mem_size = self.session.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
for inlen in self.in_lens:
if inlen > self.max_input_len:
83 changes: 56 additions & 27 deletions benchmarks/python/build.py
@@ -223,6 +223,15 @@ def build_gpt(args):
quant_mode = quant_config.quant_mode

builder = Builder()
builder_config_extra_kwargs = {}
if get_model_family(args.model) == 'mamba':
builder_config_extra_kwargs['mamba_d_state'] = build_config[
'mamba_d_state']
builder_config_extra_kwargs['mamba_d_conv'] = build_config[
'mamba_d_conv']
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -246,7 +255,8 @@ def build_gpt(args):
quant_mode=quant_mode,
use_refit=False,
opt_level=build_config['builder_opt'],
strongly_typed=strongly_typed)
strongly_typed=strongly_typed,
**builder_config_extra_kwargs)
engine_name = get_engine_name(args.model, args.dtype, world_size,
runtime_rank)
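The builder_config_extra_kwargs dict introduced in the hunk above keeps family-specific settings out of the common create_builder_config() call: only the mamba family populates it, and every other family forwards an empty dict. A minimal illustration of the pattern with a dummy create_builder_config; apart from the mamba_* keys taken from the diff, all names here are invented for the example.

def create_builder_config(name, precision, **extra):
    # Dummy stand-in for Builder.create_builder_config; just records its inputs.
    return {'name': name, 'precision': precision, **extra}


def make_config(model_family, build_config, max_beam_width):
    builder_config_extra_kwargs = {}
    if model_family == 'mamba':
        builder_config_extra_kwargs['mamba_d_state'] = build_config['mamba_d_state']
        builder_config_extra_kwargs['mamba_d_conv'] = build_config['mamba_d_conv']
        builder_config_extra_kwargs['mamba_expand'] = build_config['mamba_expand']
        builder_config_extra_kwargs['max_beam_width'] = max_beam_width
    return create_builder_config('demo', 'float16', **builder_config_extra_kwargs)


assert 'mamba_d_state' not in make_config('gpt', {}, 1)
assert make_config('mamba',
                   {'mamba_d_state': 16, 'mamba_d_conv': 4, 'mamba_expand': 2},
                   1)['mamba_expand'] == 2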

@@ -360,7 +370,8 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)

tensorrt_llm_model = optimize_model(tensorrt_llm_model,
use_fused_mlp=True)
elif family == "gptj":
config = {
'architecture': 'GPTJForCausalLM',
@@ -524,7 +535,9 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)

tensorrt_llm_model = optimize_model(
tensorrt_llm_model,
use_parallel_embedding=config.use_parallel_embedding)
elif family == "falcon":
config = {
'architecture':
@@ -666,32 +679,45 @@ def build_gpt(args):

elif family == "qwen":
config = {
'architecture': 'QWenForCausalLM',
'dtype': args.dtype,
'num_hidden_layers': build_config['num_layers'],
'num_attention_heads': build_config['num_heads'],
'hidden_size': build_config['hidden_size'],
'intermediate_size': build_config['inter_size'],
'num_key_value_heads': num_kv_heads,
'vocab_size': build_config['vocab_size'],
'position_embedding_type': 'rope_gpt_neox',
'max_position_embeddings': build_config['n_positions'],
'hidden_act': build_config['hidden_act'],
'rotary_base': 10000.0,
'norm_epsilon': 1e-06,
'architecture':
'QWenForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
build_config['num_layers'],
'num_attention_heads':
build_config['num_heads'],
'num_key_value_heads':
build_config['num_heads'] if build_config['num_kv_heads'] is None
else build_config['num_kv_heads'],
'hidden_size':
build_config['hidden_size'],
'intermediate_size':
build_config['inter_size'],
'vocab_size':
build_config['vocab_size'],
'position_embedding_type':
'rope_gpt_neox',
'max_position_embeddings':
build_config['n_positions'],
'hidden_act':
build_config['hidden_act'],
'quantization': {
'group_size': 128,
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo,
'group_size': 128
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)

elif family == "mamba":
config = {
'architecture': 'MambaLMHeadModel',
@@ -716,10 +742,6 @@ def build_gpt(args):
else:
raise Exception(f'Unexpected model: {args.model}')

if family in ['llama']:
tensorrt_llm_model = optimize_model(tensorrt_llm_model,
use_fused_mlp=True)

# Module -> Network
network = builder.create_network()
network.trt_network.name = engine_name
@@ -1225,14 +1247,21 @@ def build_enc_dec(args):
def main(args):
logger.set_level(args.log_level)
if args.model in get_allowed_models(benchmark_type="gpt"):
build_gpt(args)
engine = build_gpt(args)[0]
engine_size = engine.nbytes
elif args.model in get_allowed_models(benchmark_type="bert"):
build_bert(args)
engine = build_bert(args)[0]
engine_size = engine.nbytes
elif args.model in get_allowed_models(benchmark_type="enc_dec"):
build_enc_dec(args)
encoder_engine, decoder_engine = build_enc_dec(args)[:2]
engine_size = encoder_engine.nbytes + decoder_engine.nbytes
else:
raise Exception(f'Unexpected model: {args.model}')

# Print engine size for CI/CD to track.
logger.info(
f"Total engine size per GPU is {engine_size / 1048576:.2f} MiB.")


if __name__ == '__main__':
mp.set_start_method('spawn')
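main() now keeps the serialized engine buffers returned by the build_* helpers and logs their combined size, so CI/CD can track engine growth per GPU. A small sketch of the arithmetic, under the assumption that each engine is a bytes-like buffer exposing .nbytes as in the diff above (numpy arrays are used purely as fakes):

import numpy as np

# Fake serialized engines: a 3 MiB encoder plus a 5 MiB decoder.
encoder_engine = np.zeros(3 * 1048576, dtype=np.uint8)
decoder_engine = np.zeros(5 * 1048576, dtype=np.uint8)

engine_size = encoder_engine.nbytes + decoder_engine.nbytes
print(f"Total engine size per GPU is {engine_size / 1048576:.2f} MiB.")
# Prints: Total engine size per GPU is 8.00 MiB.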
6 changes: 6 additions & 0 deletions benchmarks/python/enc_dec_benchmark.py
@@ -204,6 +204,12 @@ def read_config(component):
self.decoder_runtime_mapping,
)

# Print context memory size for CI/CD to track.
context_mem_size = self.encoder_session.context_mem_size + self.decoder_session.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
if 'whisper' in self.model_name:
print(
62 changes: 62 additions & 0 deletions benchmarks/python/gpt_benchmark.py
@@ -16,6 +16,7 @@
from dataclasses import asdict
from math import ceil

import pandas as pd
import torch

import tensorrt_llm
@@ -93,6 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
# Plugins
self.use_gpt_attention_plugin = False
self.remove_input_padding = False
self.use_mamba_conv1d_plugin = False
if args.mode == 'plugin':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
@@ -129,6 +131,7 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
remove_input_padding=self.remove_input_padding,
quant_mode=self.quant_mode,
use_custom_all_reduce=self.use_custom_all_reduce,
mamba_conv1d_plugin=self.use_mamba_conv1d_plugin,
)
if args.model == 'chatglm_6b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
@@ -177,6 +180,12 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)

# Print context memory size for CI/CD to track.
context_mem_size = self.decoder.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
for inlen, outlen in self.in_out_lens:
if inlen > self.max_input_len or outlen > self.max_output_len:
@@ -338,3 +347,56 @@ def report(self,
kv_pairs = [f"{k} {v}" for k, v in report_dict.items()]
line = '[BENCHMARK] ' + " ".join(kv_pairs)
print(line)

if benchmark_profiler is not None and benchmark_profiler.is_recording_perf_profile:
perf_profile_data = self.decoder.profiler.results
if not perf_profile_data:
tensorrt_llm.logger.error("profiler data is empty")
return

ctx_layers = list()
generation_layers = list()
start = 0
ctx_iter_cnt = 0
generation_iter_cnt = 0

# split context/generations layer information
for idx, layer_info in enumerate(perf_profile_data):
if layer_info[0] == "step":
if layer_info[1] == 0:
ctx_layers.extend(perf_profile_data[start:idx])
ctx_iter_cnt += 1
else:
generation_layers.extend(perf_profile_data[start:idx])
generation_iter_cnt += 1
start = idx + 1

# Reduce all data
def reduce_layer_data(layers):
layer_infos = dict()
for layer in layers:
if layer[0] in layer_infos:
layer_infos[layer[0]] += layer[1]
else:
layer_infos[layer[0]] = layer[1]
return layer_infos

# Dump kernel data
def dump_kernel_profile_table(name: str, profile_data: list,
iter_cnt: int):
table = pd.DataFrame(
[[k, '{:0.3f}'.format(v)] for k, v in profile_data.items()],
columns=['{} Phase LayerName'.format(name), 'times (ms)'])

def ljust(s):
s = s.astype(str).str.strip()
return s.str.ljust(s.str.len().max())

print(table.apply(ljust).to_string(index=False, justify='left'))
print("{} phase step iter: {}".format(name, iter_cnt))

ctx_layer_infos = reduce_layer_data(ctx_layers)
generation_layer_infos = reduce_layer_data(generation_layers)
dump_kernel_profile_table("Context", ctx_layer_infos, ctx_iter_cnt)
dump_kernel_profile_table("Generation", generation_layer_infos,
generation_iter_cnt)
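The per-layer dump above relies on "step" marker rows to split profiler output into context (step 0) and generation iterations, then sums repeated layer names before rendering the table. A minimal sketch with hypothetical layer timings; real entries come from self.decoder.profiler.results, and the committed code prints the totals through pandas rather than raw dicts:

perf_profile_data = [
    ("attention", 1.2), ("mlp", 0.8), ("step", 0),  # context iteration
    ("attention", 0.3), ("mlp", 0.2), ("step", 1),  # generation iteration 1
    ("attention", 0.3), ("mlp", 0.2), ("step", 2),  # generation iteration 2
]

ctx_layers, generation_layers = [], []
start = ctx_iter_cnt = generation_iter_cnt = 0
for idx, (name, value) in enumerate(perf_profile_data):
    if name == "step":
        if value == 0:
            ctx_layers.extend(perf_profile_data[start:idx])
            ctx_iter_cnt += 1
        else:
            generation_layers.extend(perf_profile_data[start:idx])
            generation_iter_cnt += 1
        start = idx + 1


def reduce_layer_data(layers):
    # Sum the recorded time for every layer name across iterations.
    totals = {}
    for name, value in layers:
        totals[name] = totals.get(name, 0.0) + value
    return totals


print(reduce_layer_data(ctx_layers))         # {'attention': 1.2, 'mlp': 0.8}
print(reduce_layer_data(generation_layers))  # {'attention': 0.6, 'mlp': 0.4}
print(ctx_iter_cnt, generation_iter_cnt)     # 1 2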