Update TensorRT-LLM (NVIDIA#1427)
* Update TensorRT-LLM

---------

Co-authored-by: meghagarwal <[email protected]>
kaiyux and megha95 authored Apr 9, 2024
1 parent 118b3d7 commit 035b99e
Showing 216 changed files with 733,895 additions and 2,577 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -1 +1,2 @@
*.a filter=lfs diff=lfs merge=lfs -text
*.lib filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions benchmarks/cpp/README.md
@@ -83,6 +83,7 @@ python3 prepare_dataset.py \
[--time-delay-dist exponential_dist] \
dataset
--dataset-name <name of the dataset> \
--dataset-split <split of the dataset to use> \
--dataset-input-key <dataset dictionary key for input> \
--dataset-prompt-key <dataset dictionary key for prompt> \
--dataset-output-key <dataset dictionary key for output> \
@@ -99,6 +100,7 @@ python3 prepare_dataset.py \
--output cnn_dailymail.json
dataset
--dataset-name cnn_dailymail \
--dataset-split validation \
--dataset-config-name 3.0.0 \
--dataset-input-key article \
--dataset-prompt "Summarize the following article:" \
321 changes: 261 additions & 60 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions benchmarks/cpp/utils/prepare_real_data.py
@@ -125,11 +125,10 @@ def load_dataset_from_hf(dataset_config: DatasetConfig):
type=str,
default=None,
help=f"Dataset config name in HuggingFace (if exists).")
@click.option(
"--dataset-split",
type=str,
default=None,
help=f"Split of the dataset to use. Default will include all splits.")
@click.option("--dataset-split",
type=str,
required=True,
help=f"Split of the dataset to use.")
@click.option("--dataset-input-key",
required=True,
type=str,
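With the change above, --dataset-split is now required rather than defaulting to all splits, so every prepare_dataset.py invocation has to name a split explicitly (the README example above passes validation). Below is a minimal, self-contained sketch of what required=True buys in click; the load command and its body are hypothetical stand-ins for illustration, not code from the repository.

import click
from click.testing import CliRunner


@click.command()
@click.option("--dataset-split",
              type=str,
              required=True,
              help="Split of the dataset to use.")
def load(dataset_split):
    # Hypothetical stand-in for the real prepare_dataset.py dataset command.
    click.echo(f"loading split: {dataset_split}")


runner = CliRunner()
assert runner.invoke(load, ["--dataset-split", "validation"]).exit_code == 0
# Omitting the now-required option makes click abort with a usage error (exit code 2).
assert runner.invoke(load, []).exit_code == 2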
8 changes: 8 additions & 0 deletions benchmarks/python/benchmark.py
@@ -244,6 +244,11 @@ def parse_arguments():
help=
"Check the estimated memory usage against the total GPU memory. Raise error if the estimated memory requirement is bigger than the total GPU memory"
"Warning: only GPT model family is supported for now")
parser.add_argument(
'--dump_profile',
default=False,
action='store_true',
help="Print profile information per layer (default = disabled)")
return parser.parse_args()


@@ -310,6 +315,9 @@ def main(args):
if args.build_only:
return

if args.dump_profile and benchmark_profiler is not None:
benchmark_profiler.set_recording_perf_profile(True)

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
benchmarker.print_report_header(args.csv,
5 changes: 5 additions & 0 deletions benchmarks/python/benchmark_profiler.py
@@ -21,12 +21,14 @@ class BenchmarkProfiler(object):
timer_dict: dict
aux_info: dict
started: bool
is_recording_perf_profile: bool

def __init__(self):
self.cuda_event_dict = {}
self.timer_dict = {}
self.aux_info = {}
self.started = False
self.is_recording_perf_profile = False

def clean(self):
self.cuda_event_dict = {}
@@ -75,3 +77,6 @@ def add_aux_info(self, aux_name: str, add_value):
if not self.started:
return
self.aux_info[aux_name] += add_value

def set_recording_perf_profile(self, value: bool):
self.is_recording_perf_profile = value
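Taken together with the --dump_profile flag added to benchmark.py above, the new attribute is just a switch that the runtime consults before collecting per-layer timings. A minimal sketch of the wiring, using a stripped-down stand-in for BenchmarkProfiler rather than the real class:

import argparse


class BenchmarkProfiler:
    # Stripped-down stand-in; the real class also tracks CUDA events and timers.
    def __init__(self):
        self.is_recording_perf_profile = False

    def set_recording_perf_profile(self, value: bool):
        self.is_recording_perf_profile = value


parser = argparse.ArgumentParser()
parser.add_argument('--dump_profile',
                    default=False,
                    action='store_true',
                    help="Print profile information per layer (default = disabled)")
args = parser.parse_args(['--dump_profile'])

benchmark_profiler = BenchmarkProfiler()
if args.dump_profile and benchmark_profiler is not None:
    benchmark_profiler.set_recording_perf_profile(True)

assert benchmark_profiler.is_recording_perf_profile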
6 changes: 6 additions & 0 deletions benchmarks/python/bert_benchmark.py
@@ -64,6 +64,12 @@ def __init__(self, args, batch_sizes, in_lens, rank, world_size):
self.session = tensorrt_llm.runtime.Session.from_serialized_engine(
engine_buffer)

# Print context memory size for CI/CD to track.
context_mem_size = self.session.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
for inlen in self.in_lens:
if inlen > self.max_input_len:
83 changes: 56 additions & 27 deletions benchmarks/python/build.py
@@ -223,6 +223,15 @@ def build_gpt(args):
quant_mode = quant_config.quant_mode

builder = Builder()
builder_config_extra_kwargs = {}
if get_model_family(args.model) == 'mamba':
builder_config_extra_kwargs['mamba_d_state'] = build_config[
'mamba_d_state']
builder_config_extra_kwargs['mamba_d_conv'] = build_config[
'mamba_d_conv']
builder_config_extra_kwargs['mamba_expand'] = build_config[
'mamba_expand']
builder_config_extra_kwargs['max_beam_width'] = max_beam_width
builder_config = builder.create_builder_config(
name=args.model,
precision=args.dtype,
@@ -246,7 +255,8 @@ def build_gpt(args):
quant_mode=quant_mode,
use_refit=False,
opt_level=build_config['builder_opt'],
strongly_typed=strongly_typed)
strongly_typed=strongly_typed,
**builder_config_extra_kwargs)
engine_name = get_engine_name(args.model, args.dtype, world_size,
runtime_rank)
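The builder_config_extra_kwargs dict introduced in the hunk above keeps family-specific settings out of the common create_builder_config() call: only the mamba family populates it, and every other family forwards an empty dict. A minimal illustration of the pattern with a dummy create_builder_config; apart from the mamba_* keys taken from the diff, all names here are invented for the example.

def create_builder_config(name, precision, **extra):
    # Dummy stand-in for Builder.create_builder_config; just records its inputs.
    return {'name': name, 'precision': precision, **extra}


def make_config(model_family, build_config, max_beam_width):
    builder_config_extra_kwargs = {}
    if model_family == 'mamba':
        builder_config_extra_kwargs['mamba_d_state'] = build_config['mamba_d_state']
        builder_config_extra_kwargs['mamba_d_conv'] = build_config['mamba_d_conv']
        builder_config_extra_kwargs['mamba_expand'] = build_config['mamba_expand']
        builder_config_extra_kwargs['max_beam_width'] = max_beam_width
    return create_builder_config('demo', 'float16', **builder_config_extra_kwargs)


assert 'mamba_d_state' not in make_config('gpt', {}, 1)
assert make_config('mamba',
                   {'mamba_d_state': 16, 'mamba_d_conv': 4, 'mamba_expand': 2},
                   1)['mamba_expand'] == 2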

@@ -360,7 +370,8 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)

tensorrt_llm_model = optimize_model(tensorrt_llm_model,
use_fused_mlp=True)
elif family == "gptj":
config = {
'architecture': 'GPTJForCausalLM',
@@ -524,7 +535,9 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)

tensorrt_llm_model = optimize_model(
tensorrt_llm_model,
use_parallel_embedding=config.use_parallel_embedding)
elif family == "falcon":
config = {
'architecture':
@@ -666,32 +679,45 @@ def build_gpt(args):

elif family == "qwen":
config = {
'architecture': 'QWenForCausalLM',
'dtype': args.dtype,
'num_hidden_layers': build_config['num_layers'],
'num_attention_heads': build_config['num_heads'],
'hidden_size': build_config['hidden_size'],
'intermediate_size': build_config['inter_size'],
'num_key_value_heads': num_kv_heads,
'vocab_size': build_config['vocab_size'],
'position_embedding_type': 'rope_gpt_neox',
'max_position_embeddings': build_config['n_positions'],
'hidden_act': build_config['hidden_act'],
'rotary_base': 10000.0,
'norm_epsilon': 1e-06,
'architecture':
'QWenForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
build_config['num_layers'],
'num_attention_heads':
build_config['num_heads'],
'num_key_value_heads':
build_config['num_heads'] if build_config['num_kv_heads'] is None
else build_config['num_kv_heads'],
'hidden_size':
build_config['hidden_size'],
'intermediate_size':
build_config['inter_size'],
'vocab_size':
build_config['vocab_size'],
'position_embedding_type':
'rope_gpt_neox',
'max_position_embeddings':
build_config['n_positions'],
'hidden_act':
build_config['hidden_act'],
'quantization': {
'group_size': 128,
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo,
'group_size': 128
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)

elif family == "mamba":
config = {
'architecture': 'MambaLMHeadModel',
@@ -716,10 +742,6 @@ def build_gpt(args):
else:
raise Exception(f'Unexpected model: {args.model}')

if family in ['llama']:
tensorrt_llm_model = optimize_model(tensorrt_llm_model,
use_fused_mlp=True)

# Module -> Network
network = builder.create_network()
network.trt_network.name = engine_name
@@ -1225,14 +1247,21 @@ def build_enc_dec(args):
def main(args):
logger.set_level(args.log_level)
if args.model in get_allowed_models(benchmark_type="gpt"):
build_gpt(args)
engine = build_gpt(args)[0]
engine_size = engine.nbytes
elif args.model in get_allowed_models(benchmark_type="bert"):
build_bert(args)
engine = build_bert(args)[0]
engine_size = engine.nbytes
elif args.model in get_allowed_models(benchmark_type="enc_dec"):
build_enc_dec(args)
encoder_engine, decoder_engine = build_enc_dec(args)[:2]
engine_size = encoder_engine.nbytes + decoder_engine.nbytes
else:
raise Exception(f'Unexpected model: {args.model}')

# Print engine size for CI/CD to track.
logger.info(
f"Total engine size per GPU is {engine_size / 1048576:.2f} MiB.")


if __name__ == '__main__':
mp.set_start_method('spawn')
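main() now keeps the serialized engine buffers returned by the build_* helpers and logs their combined size, so CI/CD can track engine growth per GPU. A small sketch of the arithmetic, under the assumption that each engine is a bytes-like buffer exposing .nbytes as in the diff above (numpy arrays are used purely as fakes):

import numpy as np

# Fake serialized engines: a 3 MiB encoder plus a 5 MiB decoder.
encoder_engine = np.zeros(3 * 1048576, dtype=np.uint8)
decoder_engine = np.zeros(5 * 1048576, dtype=np.uint8)

engine_size = encoder_engine.nbytes + decoder_engine.nbytes
print(f"Total engine size per GPU is {engine_size / 1048576:.2f} MiB.")
# Prints: Total engine size per GPU is 8.00 MiB.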
6 changes: 6 additions & 0 deletions benchmarks/python/enc_dec_benchmark.py
@@ -204,6 +204,12 @@ def read_config(component):
self.decoder_runtime_mapping,
)

# Print context memory size for CI/CD to track.
context_mem_size = self.encoder_session.context_mem_size + self.decoder_session.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
if 'whisper' in self.model_name:
print(
62 changes: 62 additions & 0 deletions benchmarks/python/gpt_benchmark.py
@@ -16,6 +16,7 @@
from dataclasses import asdict
from math import ceil

import pandas as pd
import torch

import tensorrt_llm
@@ -93,6 +94,7 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
# Plugins
self.use_gpt_attention_plugin = False
self.remove_input_padding = False
self.use_mamba_conv1d_plugin = False
if args.mode == 'plugin':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
@@ -129,6 +131,7 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
remove_input_padding=self.remove_input_padding,
quant_mode=self.quant_mode,
use_custom_all_reduce=self.use_custom_all_reduce,
mamba_conv1d_plugin=self.use_mamba_conv1d_plugin,
)
if args.model == 'chatglm_6b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
@@ -177,6 +180,12 @@ def __init__(self, args, batch_sizes, in_out_lens, rank, world_size):
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)

# Print context memory size for CI/CD to track.
context_mem_size = self.decoder.context_mem_size
print(
f"Allocated {context_mem_size / 1048576.0:.2f} MiB for execution context memory."
)

def get_config(self):
for inlen, outlen in self.in_out_lens:
if inlen > self.max_input_len or outlen > self.max_output_len:
@@ -338,3 +347,56 @@ def report(self,
kv_pairs = [f"{k} {v}" for k, v in report_dict.items()]
line = '[BENCHMARK] ' + " ".join(kv_pairs)
print(line)

if benchmark_profiler is not None and benchmark_profiler.is_recording_perf_profile:
perf_profile_data = self.decoder.profiler.results
if not perf_profile_data:
tensorrt_llm.logger.error("profiler data is empty")
return

ctx_layers = list()
generation_layers = list()
start = 0
ctx_iter_cnt = 0
generation_iter_cnt = 0

# split context/generations layer information
for idx, layer_info in enumerate(perf_profile_data):
if layer_info[0] == "step":
if layer_info[1] == 0:
ctx_layers.extend(perf_profile_data[start:idx])
ctx_iter_cnt += 1
else:
generation_layers.extend(perf_profile_data[start:idx])
generation_iter_cnt += 1
start = idx + 1

# Reduce all data
def reduce_layer_data(layers):
layer_infos = dict()
for layer in layers:
if layer[0] in layer_infos:
layer_infos[layer[0]] += layer[1]
else:
layer_infos[layer[0]] = layer[1]
return layer_infos

# Dump kernel data
def dump_kernel_profile_table(name: str, profile_data: list,
iter_cnt: int):
table = pd.DataFrame(
[[k, '{:0.3f}'.format(v)] for k, v in profile_data.items()],
columns=['{} Phase LayerName'.format(name), 'times (ms)'])

def ljust(s):
s = s.astype(str).str.strip()
return s.str.ljust(s.str.len().max())

print(table.apply(ljust).to_string(index=False, justify='left'))
print("{} phase step iter: {}".format(name, iter_cnt))

ctx_layer_infos = reduce_layer_data(ctx_layers)
generation_layer_infos = reduce_layer_data(generation_layers)
dump_kernel_profile_table("Context", ctx_layer_infos, ctx_iter_cnt)
dump_kernel_profile_table("Generation", generation_layer_infos,
generation_iter_cnt)
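The per-layer dump above relies on "step" marker rows to split profiler output into context (step 0) and generation iterations, then sums repeated layer names before rendering the table. A minimal sketch with hypothetical layer timings; real entries come from self.decoder.profiler.results, and the committed code prints the totals through pandas rather than raw dicts:

perf_profile_data = [
    ("attention", 1.2), ("mlp", 0.8), ("step", 0),  # context iteration
    ("attention", 0.3), ("mlp", 0.2), ("step", 1),  # generation iteration 1
    ("attention", 0.3), ("mlp", 0.2), ("step", 2),  # generation iteration 2
]

ctx_layers, generation_layers = [], []
start = ctx_iter_cnt = generation_iter_cnt = 0
for idx, (name, value) in enumerate(perf_profile_data):
    if name == "step":
        if value == 0:
            ctx_layers.extend(perf_profile_data[start:idx])
            ctx_iter_cnt += 1
        else:
            generation_layers.extend(perf_profile_data[start:idx])
            generation_iter_cnt += 1
        start = idx + 1


def reduce_layer_data(layers):
    # Sum the recorded time for every layer name across iterations.
    totals = {}
    for name, value in layers:
        totals[name] = totals.get(name, 0.0) + value
    return totals


print(reduce_layer_data(ctx_layers))         # {'attention': 1.2, 'mlp': 0.8}
print(reduce_layer_data(generation_layers))  # {'attention': 0.6, 'mlp': 0.4}
print(ctx_iter_cnt, generation_iter_cnt)     # 1 2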