Merge remote-tracking branch 'origin/main' into jh/persistent_kernel_impl
jacobhinkle committed Feb 6, 2025
2 parents 4d0226c + 714f974 commit 52d2bca
Showing 61 changed files with 2,932 additions and 738 deletions.
.github/workflows/pull.yml (5 changes: 3 additions & 2 deletions)
@@ -48,9 +48,10 @@ jobs:
 OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }}
 OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }}
 CONFIG__MODEL: ${{ secrets.LLM_CONFIG__MODEL }}
-CONFIG__CUSTOM_MODEL_MAX_TOKENS: 131072
+CONFIG__CUSTOM_MODEL_MAX_TOKENS: 32768
 CONFIG__FALLBACK_MODELS: '[]'
-CONFIG__MAX_MODEL_TOKENS: 65536
+CONFIG__MAX_MODEL_TOKENS: 32768
+CONFIG__PUBLISH_OUTPUT_PROGRESS: false
 PR_REVIEWER__REQUIRE_SCORE_REVIEW: false
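For context (an assumption about the consumer of these variables, not something this diff states): double-underscore names like CONFIG__MAX_MODEL_TOKENS conventionally map to nested configuration entries such as config.max_model_tokens. A minimal Python sketch of that convention, using a hypothetical env_to_nested_config helper:

    import json
    import os


    def env_to_nested_config(environ=os.environ):
        # Fold SECTION__KEY variables into a {"section": {"key": value}} dict.
        # This sketches the usual double-underscore convention only; the actual
        # consumer of the variables above may parse values differently.
        config = {}
        for name, raw in environ.items():
            if "__" not in name:
                continue
            section, key = name.lower().split("__", 1)
            try:
                value = json.loads(raw)  # "32768" -> 32768, "false" -> False, "[]" -> []
            except json.JSONDecodeError:
                value = raw  # plain strings (model names, URLs) stay as-is
            config.setdefault(section, {})[key] = value
        return config

    # e.g. CONFIG__MAX_MODEL_TOKENS=32768 yields config["config"]["max_model_tokens"] == 32768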
CMakeLists.txt (2 changes: 2 additions & 0 deletions)
@@ -201,6 +201,7 @@ list(APPEND NVFUSER_SRCS
 ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
 ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
 ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
+${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp
 ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp
 ${NVFUSER_SRCS_DIR}/rng.cpp
 ${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
@@ -867,6 +868,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
 ${NVFUSER_ROOT}/runtime/mbarrier.cu
 ${NVFUSER_ROOT}/runtime/memory.cu
 ${NVFUSER_ROOT}/runtime/random_numbers.cu
+${NVFUSER_ROOT}/runtime/tensor_memory.cu
 ${NVFUSER_ROOT}/runtime/tensor.cu
 ${NVFUSER_ROOT}/runtime/tuple.cu
 ${NVFUSER_ROOT}/runtime/type_traits.cu
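An aside on why a .cu file lands in NVFUSER_RUNTIME_FILES: nvFuser ships these runtime headers with the library and prepends them to JIT-generated kernels at compile time, which typically means stringifying each file into the binary during the build. A sketch of such a stringification step (hypothetical script and namespace; the actual CMake rule is not shown in this diff):

    from pathlib import Path


    def stringify_runtime_file(cu_path: Path, out_dir: Path) -> None:
        # Wrap a runtime .cu source in a C++ raw-string constant so it can be
        # compiled into the library and prepended to JIT-generated kernels.
        src = cu_path.read_text()
        var = cu_path.stem  # e.g. "tensor_memory"
        header = (
            "namespace nvfuser_resources {\n"
            f'constexpr const char* {var}_cu = R"esc(\n{src}\n)esc";\n'
            "}\n"
        )
        (out_dir / f"{var}.h").write_text(header)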
benchmarks/python/model_configs.py (105 changes: 105 additions & 0 deletions)
@@ -0,0 +1,105 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
from functools import partial

from transformers import AutoConfig


def llama_hf_cfg(config_str):
    class Config:
        def __init__(
            self, n_head, head_size, n_query_groups, rope_n_elem, batches, seq_length
        ):
            self.n_head = n_head
            self.head_size = head_size
            self.n_query_groups = n_query_groups
            self.rope_n_elem = rope_n_elem
            self.batches = batches
            self.seq_length = seq_length

    configs = {}
    configs["llama_2_7b_hf"] = Config(
        n_head=32,
        head_size=128,
        n_query_groups=32,
        rope_n_elem=128,
        batches=2,
        seq_length=4096,
    )
    configs["llama_3_8B"] = Config(
        n_head=32,
        head_size=128,
        n_query_groups=8,
        rope_n_elem=128,
        batches=2,
        seq_length=8192,
    )

    return configs[config_str]


def hf_qwen2_cfg():
    config = AutoConfig.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
    config.batch_size = 1
    config.seq_len = 4096
    config._attn_implementation = "sdpa"
    return config


def hf_phi3_cfg():
    config = AutoConfig.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    config.batch_size = 1
    config.seq_len = 8192
    config._attn_implementation = "sdpa"
    return config


def hf_mistral_nemo_cfg():
    import json
    from transformers.models.mistral import MistralConfig

    mistral_cfg_str = r"""{
        "_name_or_path": "mistralai/Mistral-Nemo-Base-2407",
        "architectures": [
            "MistralForCausalLM"
        ],
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "head_dim": 128,
        "hidden_act": "silu",
        "hidden_size": 5120,
        "initializer_range": 0.02,
        "intermediate_size": 14336,
        "max_position_embeddings": 128000,
        "model_type": "mistral",
        "num_attention_heads": 32,
        "num_hidden_layers": 40,
        "num_key_value_heads": 8,
        "rms_norm_eps": 1e-05,
        "rope_theta": 1000000.0,
        "sliding_window": null,
        "tie_word_embeddings": false,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.43.3",
        "use_cache": true,
        "vocab_size": 131072
    }
    """

    cfg = MistralConfig.from_dict(json.loads(mistral_cfg_str))
    cfg.batch_size = 1
    cfg.seq_len = 4096
    cfg._attn_implementation = "sdpa"

    return cfg


configs = {
"llama_2_7b_hf": partial(llama_hf_cfg, config_str="llama_2_7b_hf"),
"llama_3_8B": partial(llama_hf_cfg, config_str="llama_3_8B"),
"hf_qwen2": hf_qwen2_cfg,
"hf_phi3": hf_phi3_cfg,
"hf_mistral_nemo": hf_mistral_nemo_cfg,
}
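Every entry in this registry is a zero-argument callable, so a benchmark can resolve any model configuration uniformly. A hypothetical call site (not part of this diff):

    from model_configs import configs

    llama_cfg = configs["llama_3_8B"]()  # lightweight local Config object
    print(llama_cfg.n_head, llama_cfg.seq_length)  # 32 8192

    # The hf_* entries return transformers configs and fetch model metadata
    # from the Hugging Face hub on first use.
    qwen_cfg = configs["hf_qwen2"]()
    print(qwen_cfg.batch_size, qwen_cfg.seq_len)  # 1 4096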
(Diffs for the remaining 58 changed files were not loaded.)
