Add Copyright to utils and fix some more greptiles complaints

tdophung · tdophung · commit b742244f3ae5 · 2025-10-28T14:19:30.000-07:00
Signed-off-by: tdophung &lt;tdophung@nvidia.com&gt;
diff --git a/docs/examples/quickstart_jax.ipynb b/docs/examples/quickstart_jax.ipynb
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "881fd001",
    "metadata": {},
    "outputs": [],
@@ -65,18 +65,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
    "id": "d5284a38",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:absl:Tensorflow library not found, tensorflow.io.gfile operations will use native shim calls. GCS paths (i.e. 'gs://...') cannot be accessed.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import jax\n",
     "import jax.numpy as jnp\n",
@@ -87,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "id": "a4d1cfdc",
    "metadata": {},
    "outputs": [],
@@ -181,7 +173,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "id": "8b44649d",
    "metadata": {},
    "outputs": [],
@@ -202,15 +194,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "e44ed26d",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "BasicTransformerLayer initialized successfully!\n",
+      "Pure Flax BasicTransformerLayer initialized successfully!\n",
       "Parameter shapes: {'params': {'BasicMLP_0': {'Dense_0': {'bias': (16384,), 'kernel': (4096, 16384)}, 'Dense_1': {'bias': (4096,), 'kernel': (16384, 4096)}}, 'Dense_0': {'bias': (12288,), 'kernel': (4096, 12288)}, 'Dense_1': {'bias': (4096,), 'kernel': (4096, 4096)}, 'LayerNorm_0': {'bias': (4096,), 'scale': (4096,)}, 'LayerNorm_1': {'bias': (4096,), 'scale': (4096,)}}}\n"
      ]
     }
@@ -232,7 +224,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 12,
    "id": "de91af7a",
    "metadata": {},
    "outputs": [
@@ -258,15 +250,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
    "id": "037bc8d9",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mean time: 27.949795722961426 ms\n"
+      "Mean time: 27.269372940063477 ms\n"
      ]
     }
    ],
@@ -316,7 +308,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 14,
    "id": "bed20d6b",
    "metadata": {},
    "outputs": [],
@@ -336,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 15,
    "id": "56105579",
    "metadata": {},
    "outputs": [],
@@ -357,7 +349,7 @@
     "    hidden_size: int\n",
     "    ffn_hidden_size: int \n",
     "    num_attention_heads: int  \n",
-    "    layernorm_eps: int = 1e-5\n",
+    "    layernorm_eps: float = 1e-5\n",
     "    attention_dropout: float = 0.1 \n",
     "    hidden_dropout: float = 0.1\n",
     "\n",
@@ -433,21 +425,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "id": "5146cd99",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Basic parameter shapes: {'BasicMLP_0': {'Dense_0': {'bias': (16384,), 'kernel': (4096, 16384)}, 'Dense_1': {'bias': (4096,), 'kernel': (16384, 4096)}}, 'Dense_0': {'bias': (12288,), 'kernel': (4096, 12288)}, 'Dense_1': {'bias': (4096,), 'kernel': (4096, 4096)}, 'LayerNorm_0': {'bias': (4096,), 'scale': (4096,)}, 'LayerNorm_1': {'bias': (4096,), 'scale': (4096,)}}\n",
-      "TE parameter shapes: {'BasicTEMLP_0': {'DenseGeneral_0': {'bias': LogicallyPartitioned(value=(16384,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 16384), names=(), mesh=None, rules=None)}, 'DenseGeneral_1': {'bias': LogicallyPartitioned(value=(4096,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(16384, 4096), names=(), mesh=None, rules=None)}}, 'DenseGeneral_0': {'bias': LogicallyPartitioned(value=(12288,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 12288), names=(), mesh=None, rules=None)}, 'DenseGeneral_1': {'bias': LogicallyPartitioned(value=(4096,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 4096), names=(), mesh=None, rules=None)}, 'LayerNorm_0': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None)}, 'LayerNorm_1': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None)}}\n",
-      "Input shape: (2048, 4, 4096)\n",
-      "Output shape: (2048, 4, 4096)\n",
-      "Output dtype: bfloat16\n",
-      "Forward pass completed successfully!\n",
-      "Mean time: 17.249975204467773 ms\n"
+      "Basic TE parameter shapes: {'BasicTEMLP_0': {'DenseGeneral_0': {'bias': LogicallyPartitioned(value=(16384,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 16384), names=(), mesh=None, rules=None)}, 'DenseGeneral_1': {'bias': LogicallyPartitioned(value=(4096,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(16384, 4096), names=(), mesh=None, rules=None)}}, 'DenseGeneral_0': {'bias': LogicallyPartitioned(value=(12288,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 12288), names=(), mesh=None, rules=None)}, 'DenseGeneral_1': {'bias': LogicallyPartitioned(value=(4096,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 4096), names=(), mesh=None, rules=None)}, 'LayerNorm_0': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None)}, 'LayerNorm_1': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None)}}\n",
+      "Mean time: 17.397570610046387 ms\n"
      ]
     }
    ],
@@ -516,7 +503,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "11203785",
    "metadata": {},
    "outputs": [],
@@ -525,7 +512,7 @@
     "    hidden_size: int\n",
     "    ffn_hidden_size: int \n",
     "    num_attention_heads: int  \n",
-    "    layernorm_eps: int = 1e-5\n",
+    "    layernorm_eps: float = 1e-5\n",
     "    attention_dropout: float = 0.1 \n",
     "    hidden_dropout: float = 0.1\n",
     "\n",
@@ -586,15 +573,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "114de14f",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Basic parameter shapes: {'BasicMLP_0': {'Dense_0': {'bias': (16384,), 'kernel': (4096, 16384)}, 'Dense_1': {'bias': (4096,), 'kernel': (16384, 4096)}}, 'Dense_0': {'bias': (12288,), 'kernel': (4096, 12288)}, 'Dense_1': {'bias': (4096,), 'kernel': (4096, 4096)}, 'LayerNorm_0': {'bias': (4096,), 'scale': (4096,)}, 'LayerNorm_1': {'bias': (4096,), 'scale': (4096,)}}\n",
       "Fused TE parameter shapes: {'DenseGeneral_0': {'bias': LogicallyPartitioned(value=(4096,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 4096), names=(), mesh=None, rules=None)}, 'LayerNormDenseGeneral_0': {'bias': LogicallyPartitioned(value=(12288,), names=(), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 12288), names=(), mesh=None, rules=None), 'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None)}, 'LayerNormMLP_0': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'wi_bias': LogicallyPartitioned(value=(1, 16384), names=('act', 'mlp'), mesh=None, rules=None), 'wi_kernel': LogicallyPartitioned(value=(4096, 1, 16384), names=('embed', 'act', 'mlp'), mesh=None, rules=None), 'wo_bias': LogicallyPartitioned(value=(4096,), names=('embed',), mesh=None, rules=None), 'wo_kernel': LogicallyPartitioned(value=(16384, 4096), names=('mlp', 'embed'), mesh=None, rules=None)}}\n"
      ]
     }
@@ -617,19 +603,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "6b0c705e",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Input shape: (2048, 4, 4096)\n",
-      "Output shape: (2048, 4, 4096)\n",
-      "Output dtype: bfloat16\n",
-      "Forward pass completed successfully!\n",
-      "Mean time: 17.9510498046875 ms\n"
+      "Mean time: 18.0991792678833 ms\n"
      ]
     }
    ],
@@ -660,15 +642,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "id": "7496b159",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Basic parameter shapes: {'BasicMLP_0': {'Dense_0': {'bias': (16384,), 'kernel': (4096, 16384)}, 'Dense_1': {'bias': (4096,), 'kernel': (16384, 4096)}}, 'Dense_0': {'bias': (12288,), 'kernel': (4096, 12288)}, 'Dense_1': {'bias': (4096,), 'kernel': (4096, 4096)}, 'LayerNorm_0': {'bias': (4096,), 'scale': (4096,)}, 'LayerNorm_1': {'bias': (4096,), 'scale': (4096,)}}\n",
       "TE TransformerLayer parameter shapes: {'attention': {'out': {'bias': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 4096), names=('nvte_w_tp', 'nvte_w_fsdp'), mesh=None, rules=None)}, 'qkv': {'bias': LogicallyPartitioned(value=(3, 4096), names=('nvte_w_joined', 'nvte_w_tp'), mesh=None, rules=None), 'kernel': LogicallyPartitioned(value=(4096, 3, 4096), names=('nvte_w_fsdp', 'nvte_w_joined', 'nvte_w_tp'), mesh=None, rules=None), 'ln_bias': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None)}}, 'mlp': {'ln_bias': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None), 'scale': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None), 'wi_bias': LogicallyPartitioned(value=(1, 16384), names=('nvte_w_joined', 'nvte_w_tp'), mesh=None, rules=None), 'wi_kernel': LogicallyPartitioned(value=(4096, 1, 16384), names=('nvte_w_fsdp', 'nvte_w_joined', 'nvte_w_tp'), mesh=None, rules=None), 'wo_bias': LogicallyPartitioned(value=(4096,), names=('nvte_w_no_shard',), mesh=None, rules=None), 'wo_kernel': LogicallyPartitioned(value=(16384, 4096), names=('nvte_w_tp', 'nvte_w_fsdp'), mesh=None, rules=None)}, 'relpos_bias': {'rel_embedding': LogicallyPartitioned(value=(32, 32), names=('heads', 'relpos_buckets'), mesh=None, rules=None)}}\n"
      ]
     }
@@ -692,19 +673,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "6ec0f60e",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Input shape: (2048, 4, 4096)\n",
-      "Output shape: (2048, 4, 4096)\n",
-      "Output dtype: bfloat16\n",
-      "Forward pass completed successfully!\n",
-      "Mean time: 11.953592300415039 ms\n"
+      "Mean time: 11.84274673461914 ms\n"
      ]
     }
    ],
@@ -743,7 +720,7 @@
     "\n",
     "</div>\n",
     "\n",
-    "Enabling FP8 support is very simple in Transformer Engine. We just need to wrap the modules within an [fp8_autocast](../api/pytorch.rst#transformer_engine.pytorch.fp8_autocast) context manager. Note that fp8_autocast should only be used to wrap the forward pass and must exit before starting a backward pass. See the [FP8 tutorial](fp8_primer.ipynb) (currently only available in PyTorch) for a detailed explanation of FP8 recipes and the supported options.\n",
+    "Enabling FP8 support is very simple in Transformer Engine. We just need to wrap the modules within an [fp8_autocast](.../api/jax.rst#transformer_engine.jax.fp8_autocast) context manager. Note that fp8_autocast should only be used to wrap the forward pass and must exit before starting a backward pass. See the [FP8 tutorial](fp8_primer.ipynb) (currently only available in PyTorch) for a detailed explanation of FP8 recipes and the supported options.\n",
     "\n",
     "<div class=\"alert alert-warning\">\n",
     "\n",
@@ -756,7 +733,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "id": "b2aaa8ef",
    "metadata": {},
    "outputs": [
@@ -803,15 +780,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
    "id": "b9cdbf22",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Mean time: 7.995896339416504 ms\n"
+      "Mean time: 7.96757698059082 ms\n"
      ]
     }
    ],
@@ -834,6 +811,18 @@
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
diff --git a/docs/examples/quickstart_jax_utils.py b/docs/examples/quickstart_jax_utils.py
@@ -1,3 +1,7 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
 import jax
 import jax.numpy as jnp
 import time
@@ -35,15 +39,15 @@ def speedometer(
     for _ in range(warmup_iters):
         key, step_key = jax.random.split(key)
         loss, (param_grads, other_grads) = train_step_fn(
-            variables, input, output_grad, step_key, key
+            variables, input, output_grad, step_key
         )
 
     # Timing runs
     start = time.time()
     for _ in range(timing_iters):
         key, step_key = jax.random.split(key)
         loss, (param_grads, other_grads) = train_step_fn(
-            variables, input, output_grad, step_key, key
+            variables, input, output_grad, step_key
         )
     end = time.time()
 
@@ -98,7 +102,7 @@ def create_train_step_fn_vjp(
     def train_step_fn(variables: Any, inp: jnp.ndarray, grad_target: jnp.ndarray, dropout_key):
         """Compute forward pass and VJP in one step"""
 
-        # Define forward function that closes over grad_target and dropout_key
+        # Define forward function that closes over dropout_key
         def forward_fn(variables: Any, inp: jnp.ndarray):
             """Pure forward function for VJP computation"""
             rngs = {"dropout": dropout_key}