diff --git a/docs/debug.rst b/docs/debug.rst index d33568ea3b..25d29848e3 100644 --- a/docs/debug.rst +++ b/docs/debug.rst @@ -11,4 +11,5 @@ Precision debug tools debug/1_getting_started.rst debug/2_config_file_structure.rst debug/api - debug/4_distributed.rst \ No newline at end of file + debug/4_distributed.rst + debug/5_custom_feature_tutorial.ipynb \ No newline at end of file diff --git a/docs/debug/5_custom_feature_tutorial.ipynb b/docs/debug/5_custom_feature_tutorial.ipynb new file mode 100644 index 0000000000..0e658f7b99 --- /dev/null +++ b/docs/debug/5_custom_feature_tutorial.ipynb @@ -0,0 +1,632 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "85a94734", + "metadata": {}, + "source": [ + "# Adding a custom feature to precision debug tools\n", + " \n", + "TE comes with several built-in features, such as `LogFp8TensorStats`, which can log statistics for each tensor involved in matrix multiplication (GEMM) operations.\n", + "In this tutorial, we'll demonstrate how to extend TE by adding a custom feature that logs the percentage of elements in a tensor whose values exceed a configurable threshold, set with the `threshold` key in the config file.\n", + "\n", + "Custom features can be used, for example, for:\n", + "\n", + "1. Logging custom statistics.\n", + "2. Dumping intermediate tensors.\n", + "3. Experiments with modifying intermediate tensors.\n", + "\n", + "How to add a custom feature:\n", + "\n", + "1. Add a Python file with a feature class definition that inherits from `transformer_engine.debug.features.api.TEConfigAPIMapper`.\n", + "2. Wrap the class with `@Registry.register_feature(namespace=\"transformer_engine\")`.\n", + "3. 
Implement some of the API calls to nvidia-dl-framework-inspect described [here](../3_api_te_calls.rst).\n", + "\n", + "Let's define a new file at `.../custom_feature_dir/percentage_greater_than_threshold.py` containing the following code:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b4e7562d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n",
+       "#\n",
+       "# See LICENSE for license information.\n",
+       "\n",
+       """"PercentageGreaterThanThreshold Feature support for nvidia-dlframework-inspect"""\n",
+       "\n",
+       "from typing import Dict, Optional\n",
+       "\n",
+       "import torch\n",
+       "\n",
+       "from nvdlfw_inspect.registry import Registry, api_method\n",
+       "from nvdlfw_inspect.logging import MetricLogger\n",
+       "import nvdlfw_inspect.api as debug_api\n",
+       "\n",
+       "from transformer_engine.debug.features.api import TEConfigAPIMapper\n",
+       "from transformer_engine.pytorch.tensor import QuantizedTensor, Quantizer\n",
+       "\n",
+       "\n",
+       "# Class should inherit from TEConfigAPIMapper and be registered to the transformer_engine namespace.\n",
+       "@Registry.register_feature(namespace="transformer_engine")\n",
+       "class PercentageGreaterThanThreshold(TEConfigAPIMapper):\n",
+       "\n",
+       "    @api_method\n",
+       "    def inspect_tensor(\n",
+       "        self,\n",
+       "        config: Dict,\n",
+       "        layer_name: str,\n",
+       "        tensor_name: str,\n",
+       "        iteration: int,\n",
+       "        tp_group: torch.distributed.ProcessGroup,\n",
+       "        tensor: torch.Tensor,\n",
+       "        rowwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,\n",
+       "        columnwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,\n",
+       "        quantizer: Optional[Quantizer] = None,\n",
+       "    ):\n",
+       "        # API call inspect_tensor is used to gather the data about the tensor.\n",
+       "        # All API calls are documented in the `Precision debug tools / API / Calls to Nvidia-DL-Framework-Inspect`\n",
+       "        # section of the documentation.\n",
+       "\n",
+       "        threshold = config["threshold"]\n",
+       "\n",
+       "        # Get the reduction group from the debug tool\n",
+       "        # one can set it using debug_api.set_tensor_reduction_group(group)\n",
+       "        reduction_group = debug_api.get_tensor_reduction_group()\n",
+       "\n",
+       "        # Compute percentage on local tensor\n",
+       "        count = (tensor > threshold).sum().float()\n",
+       "        total = torch.tensor(tensor.numel(), dtype=torch.float32, device=tensor.device)\n",
+       "\n",
+       "        # Perform reduction across the group if needed.\n",
+       "        # Note that we perform all_reduce twice per every tensor, which is suboptimal.\n",
+       "        # For guidance on implementing efficient statistics reduction, see the implementation in the `LogTensorStats` feature.\n",
+       "        # In this tutorial we only showcase basic implementation of the feature.\n",
+       "        if reduction_group is not None:\n",
+       "            torch.distributed.all_reduce(count, group=reduction_group)\n",
+       "            torch.distributed.all_reduce(total, group=reduction_group)\n",
+       "\n",
+       "        percentage = count / total\n",
+       "\n",
+       "        # MetricLogger is a class from nvidia-dlframework-inspect.\n",
+       "        # By using it we can also use functionalities provided by nvidia-dlframework-inspect,\n",
+       "        # like logging to TensorBoard, etc.\n",
+       "        MetricLogger.log_scalar(\n",
+       "            f"{layer_name}_{tensor_name}_percentage_greater_than_threshold", percentage, iteration\n",
+       "        )\n",
+       "\n",
+       "    @api_method\n",
+       "    def inspect_tensor_enabled(\n",
+       "        self, config: Dict, layer_name: str, tensor_name: str, iteration: int\n",
+       "    ):\n",
+       "        # This call is used by TE to determine if the unfused debug layer - which is slower - needs to be run.\n",
+       "        # It returns a tuple (bool, int), where the int indicates the next iteration when the feature will be enabled\n",
+       "        # and bool indicates if the feature should be enabled at the current iteration.\n",
+       "\n",
+       "        run_current = iteration % config["freq"] == 0\n",
+       "        # run in next multiple of freq\n",
+       "        next_iter = iteration + (config["freq"] - iteration % config["freq"])\n",
+       "        return run_current, next_iter\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+c1}{\\PYZsh{} Copyright (c) 2022\\PYZhy{}2025, NVIDIA CORPORATION \\PYZam{} AFFILIATES. All rights reserved.}\n", + "\\PY{c+c1}{\\PYZsh{}}\n", + "\\PY{c+c1}{\\PYZsh{} See LICENSE for license information.}\n", + "\n", + "\\PY{l+s+sd}{\\PYZdq{}\\PYZdq{}\\PYZdq{}PercentageGreaterThanThreshold Feature support for nvidia\\PYZhy{}dlframework\\PYZhy{}inspect\\PYZdq{}\\PYZdq{}\\PYZdq{}}\n", + "\n", + "\\PY{k+kn}{from}\\PY{+w}{ }\\PY{n+nn}{typing}\\PY{+w}{ }\\PY{k+kn}{import} \\PY{n}{Dict}\\PY{p}{,} \\PY{n}{Optional}\n", + "\n", + "\\PY{k+kn}{import}\\PY{+w}{ }\\PY{n+nn}{torch}\n", + "\n", + "\\PY{k+kn}{from}\\PY{+w}{ }\\PY{n+nn}{nvdlfw\\PYZus{}inspect}\\PY{n+nn}{.}\\PY{n+nn}{registry}\\PY{+w}{ }\\PY{k+kn}{import} \\PY{n}{Registry}\\PY{p}{,} \\PY{n}{api\\PYZus{}method}\n", + "\\PY{k+kn}{from}\\PY{+w}{ }\\PY{n+nn}{nvdlfw\\PYZus{}inspect}\\PY{n+nn}{.}\\PY{n+nn}{logging}\\PY{+w}{ }\\PY{k+kn}{import} \\PY{n}{MetricLogger}\n", + "\\PY{k+kn}{import}\\PY{+w}{ }\\PY{n+nn}{nvdlfw\\PYZus{}inspect}\\PY{n+nn}{.}\\PY{n+nn}{api}\\PY{+w}{ }\\PY{k}{as}\\PY{+w}{ }\\PY{n+nn}{debug\\PYZus{}api}\n", + "\n", + "\\PY{k+kn}{from}\\PY{+w}{ }\\PY{n+nn}{transformer\\PYZus{}engine}\\PY{n+nn}{.}\\PY{n+nn}{debug}\\PY{n+nn}{.}\\PY{n+nn}{features}\\PY{n+nn}{.}\\PY{n+nn}{api}\\PY{+w}{ }\\PY{k+kn}{import} \\PY{n}{TEConfigAPIMapper}\n", + "\\PY{k+kn}{from}\\PY{+w}{ }\\PY{n+nn}{transformer\\PYZus{}engine}\\PY{n+nn}{.}\\PY{n+nn}{pytorch}\\PY{n+nn}{.}\\PY{n+nn}{tensor}\\PY{+w}{ }\\PY{k+kn}{import} \\PY{n}{QuantizedTensor}\\PY{p}{,} \\PY{n}{Quantizer}\n", + "\n", + "\n", + "\\PY{c+c1}{\\PYZsh{} Class should inherit from TEConfigAPIMapper and be registered to the transformer\\PYZus{}engine namespace.}\n", + "\\PY{n+nd}{@Registry}\\PY{o}{.}\\PY{n}{register\\PYZus{}feature}\\PY{p}{(}\\PY{n}{namespace}\\PY{o}{=}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+s2}{transformer\\PYZus{}engine}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{)}\n", + 
"\\PY{k}{class}\\PY{+w}{ }\\PY{n+nc}{PercentageGreaterThanThreshold}\\PY{p}{(}\\PY{n}{TEConfigAPIMapper}\\PY{p}{)}\\PY{p}{:}\n", + "\n", + " \\PY{n+nd}{@api\\PYZus{}method}\n", + " \\PY{k}{def}\\PY{+w}{ }\\PY{n+nf}{inspect\\PYZus{}tensor}\\PY{p}{(}\n", + " \\PY{n+nb+bp}{self}\\PY{p}{,}\n", + " \\PY{n}{config}\\PY{p}{:} \\PY{n}{Dict}\\PY{p}{,}\n", + " \\PY{n}{layer\\PYZus{}name}\\PY{p}{:} \\PY{n+nb}{str}\\PY{p}{,}\n", + " \\PY{n}{tensor\\PYZus{}name}\\PY{p}{:} \\PY{n+nb}{str}\\PY{p}{,}\n", + " \\PY{n}{iteration}\\PY{p}{:} \\PY{n+nb}{int}\\PY{p}{,}\n", + " \\PY{n}{tp\\PYZus{}group}\\PY{p}{:} \\PY{n}{torch}\\PY{o}{.}\\PY{n}{distributed}\\PY{o}{.}\\PY{n}{ProcessGroup}\\PY{p}{,}\n", + " \\PY{n}{tensor}\\PY{p}{:} \\PY{n}{torch}\\PY{o}{.}\\PY{n}{Tensor}\\PY{p}{,}\n", + " \\PY{n}{rowwise\\PYZus{}quantized\\PYZus{}tensor}\\PY{p}{:} \\PY{n}{Optional}\\PY{p}{[}\\PY{n}{torch}\\PY{o}{.}\\PY{n}{Tensor} \\PY{o}{|} \\PY{n}{QuantizedTensor}\\PY{p}{]} \\PY{o}{=} \\PY{k+kc}{None}\\PY{p}{,}\n", + " \\PY{n}{columnwise\\PYZus{}quantized\\PYZus{}tensor}\\PY{p}{:} \\PY{n}{Optional}\\PY{p}{[}\\PY{n}{torch}\\PY{o}{.}\\PY{n}{Tensor} \\PY{o}{|} \\PY{n}{QuantizedTensor}\\PY{p}{]} \\PY{o}{=} \\PY{k+kc}{None}\\PY{p}{,}\n", + " \\PY{n}{quantizer}\\PY{p}{:} \\PY{n}{Optional}\\PY{p}{[}\\PY{n}{Quantizer}\\PY{p}{]} \\PY{o}{=} \\PY{k+kc}{None}\\PY{p}{,}\n", + " \\PY{p}{)}\\PY{p}{:}\n", + " \\PY{c+c1}{\\PYZsh{} API call inspect\\PYZus{}tensor is used to gather the data about the tensor.}\n", + " \\PY{c+c1}{\\PYZsh{} All API calls are documented in the `Precision debug tools / API / Calls to Nvidia\\PYZhy{}DL\\PYZhy{}Framework\\PYZhy{}Inspect`}\n", + " \\PY{c+c1}{\\PYZsh{} section of the documentation.}\n", + "\n", + " \\PY{n}{threshold} \\PY{o}{=} \\PY{n}{config}\\PY{p}{[}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+s2}{threshold}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{]}\n", + "\n", + " \\PY{c+c1}{\\PYZsh{} Get the reduction group from the debug tool}\n", + " \\PY{c+c1}{\\PYZsh{} one can set it using 
debug\\PYZus{}api.set\\PYZus{}tensor\\PYZus{}reduction\\PYZus{}group(group)}\n", + " \\PY{n}{reduction\\PYZus{}group} \\PY{o}{=} \\PY{n}{debug\\PYZus{}api}\\PY{o}{.}\\PY{n}{get\\PYZus{}tensor\\PYZus{}reduction\\PYZus{}group}\\PY{p}{(}\\PY{p}{)}\n", + "\n", + " \\PY{c+c1}{\\PYZsh{} Compute percentage on local tensor}\n", + " \\PY{n}{count} \\PY{o}{=} \\PY{p}{(}\\PY{n}{tensor} \\PY{o}{\\PYZgt{}} \\PY{n}{threshold}\\PY{p}{)}\\PY{o}{.}\\PY{n}{sum}\\PY{p}{(}\\PY{p}{)}\\PY{o}{.}\\PY{n}{float}\\PY{p}{(}\\PY{p}{)}\n", + " \\PY{n}{total} \\PY{o}{=} \\PY{n}{torch}\\PY{o}{.}\\PY{n}{tensor}\\PY{p}{(}\\PY{n}{tensor}\\PY{o}{.}\\PY{n}{numel}\\PY{p}{(}\\PY{p}{)}\\PY{p}{,} \\PY{n}{dtype}\\PY{o}{=}\\PY{n}{torch}\\PY{o}{.}\\PY{n}{float32}\\PY{p}{,} \\PY{n}{device}\\PY{o}{=}\\PY{n}{tensor}\\PY{o}{.}\\PY{n}{device}\\PY{p}{)}\n", + "\n", + " \\PY{c+c1}{\\PYZsh{} Perform reduction across the group if needed.}\n", + " \\PY{c+c1}{\\PYZsh{} Note that we perform all\\PYZus{}reduce twice per every tensor, which is suboptimal.}\n", + " \\PY{c+c1}{\\PYZsh{} For guidance on implementing efficient statistics reduction, see the implementation in the `LogTensorStats` feature.}\n", + " \\PY{c+c1}{\\PYZsh{} In this tutorial we only showcase basic implementation of the feature.}\n", + " \\PY{k}{if} \\PY{n}{reduction\\PYZus{}group} \\PY{o+ow}{is} \\PY{o+ow}{not} \\PY{k+kc}{None}\\PY{p}{:}\n", + " \\PY{n}{torch}\\PY{o}{.}\\PY{n}{distributed}\\PY{o}{.}\\PY{n}{all\\PYZus{}reduce}\\PY{p}{(}\\PY{n}{count}\\PY{p}{,} \\PY{n}{group}\\PY{o}{=}\\PY{n}{reduction\\PYZus{}group}\\PY{p}{)}\n", + " \\PY{n}{torch}\\PY{o}{.}\\PY{n}{distributed}\\PY{o}{.}\\PY{n}{all\\PYZus{}reduce}\\PY{p}{(}\\PY{n}{total}\\PY{p}{,} \\PY{n}{group}\\PY{o}{=}\\PY{n}{reduction\\PYZus{}group}\\PY{p}{)}\n", + "\n", + " \\PY{n}{percentage} \\PY{o}{=} \\PY{n}{count} \\PY{o}{/} \\PY{n}{total}\n", + "\n", + " \\PY{c+c1}{\\PYZsh{} MetricLogger is a class from nvidia\\PYZhy{}dlframework\\PYZhy{}inspect.}\n", + " \\PY{c+c1}{\\PYZsh{} By using it we 
can also use functionalities provided by nvidia\\PYZhy{}dlframework\\PYZhy{}inspect,}\n", + " \\PY{c+c1}{\\PYZsh{} like logging to TensorBoard, etc.}\n", + " \\PY{n}{MetricLogger}\\PY{o}{.}\\PY{n}{log\\PYZus{}scalar}\\PY{p}{(}\n", + " \\PY{l+s+sa}{f}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+si}{\\PYZob{}}\\PY{n}{layer\\PYZus{}name}\\PY{l+s+si}{\\PYZcb{}}\\PY{l+s+s2}{\\PYZus{}}\\PY{l+s+si}{\\PYZob{}}\\PY{n}{tensor\\PYZus{}name}\\PY{l+s+si}{\\PYZcb{}}\\PY{l+s+s2}{\\PYZus{}percentage\\PYZus{}greater\\PYZus{}than\\PYZus{}threshold}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{,} \\PY{n}{percentage}\\PY{p}{,} \\PY{n}{iteration}\n", + " \\PY{p}{)}\n", + "\n", + " \\PY{n+nd}{@api\\PYZus{}method}\n", + " \\PY{k}{def}\\PY{+w}{ }\\PY{n+nf}{inspect\\PYZus{}tensor\\PYZus{}enabled}\\PY{p}{(}\n", + " \\PY{n+nb+bp}{self}\\PY{p}{,} \\PY{n}{config}\\PY{p}{:} \\PY{n}{Dict}\\PY{p}{,} \\PY{n}{layer\\PYZus{}name}\\PY{p}{:} \\PY{n+nb}{str}\\PY{p}{,} \\PY{n}{tensor\\PYZus{}name}\\PY{p}{:} \\PY{n+nb}{str}\\PY{p}{,} \\PY{n}{iteration}\\PY{p}{:} \\PY{n+nb}{int}\n", + " \\PY{p}{)}\\PY{p}{:}\n", + " \\PY{c+c1}{\\PYZsh{} This call is used by TE to determine if the unfused debug layer \\PYZhy{} which is slower \\PYZhy{} needs to be run.}\n", + " \\PY{c+c1}{\\PYZsh{} It returns a tuple (bool, int), where the int indicates the next iteration when the feature will be enabled}\n", + " \\PY{c+c1}{\\PYZsh{} and bool indicates if the feature should be enabled at the current iteration.}\n", + "\n", + " \\PY{n}{run\\PYZus{}current} \\PY{o}{=} \\PY{n}{iteration} \\PY{o}{\\PYZpc{}} \\PY{n}{config}\\PY{p}{[}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+s2}{freq}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{]} \\PY{o}{==} \\PY{l+m+mi}{0}\n", + " \\PY{c+c1}{\\PYZsh{} run in next multiple of freq}\n", + " \\PY{n}{next\\PYZus{}iter} \\PY{o}{=} \\PY{n}{iteration} \\PY{o}{+} \\PY{p}{(}\\PY{n}{config}\\PY{p}{[}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+s2}{freq}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{]} \\PY{o}{\\PYZhy{}} \\PY{n}{iteration} \\PY{o}{\\PYZpc{}} 
\\PY{n}{config}\\PY{p}{[}\\PY{l+s+s2}{\\PYZdq{}}\\PY{l+s+s2}{freq}\\PY{l+s+s2}{\\PYZdq{}}\\PY{p}{]}\\PY{p}{)}\n", + " \\PY{k}{return} \\PY{n}{run\\PYZus{}current}\\PY{p}{,} \\PY{n}{next\\PYZus{}iter}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", + "#\n", + "# See LICENSE for license information.\n", + "\n", + "\"\"\"PercentageGreaterThanThreshold Feature support for nvidia-dlframework-inspect\"\"\"\n", + "\n", + "from typing import Dict, Optional\n", + "\n", + "import torch\n", + "\n", + "from nvdlfw_inspect.registry import Registry, api_method\n", + "from nvdlfw_inspect.logging import MetricLogger\n", + "import nvdlfw_inspect.api as debug_api\n", + "\n", + "from transformer_engine.debug.features.api import TEConfigAPIMapper\n", + "from transformer_engine.pytorch.tensor import QuantizedTensor, Quantizer\n", + "\n", + "\n", + "# Class should inherit from TEConfigAPIMapper and be registered to the transformer_engine namespace.\n", + "@Registry.register_feature(namespace=\"transformer_engine\")\n", + "class PercentageGreaterThanThreshold(TEConfigAPIMapper):\n", + "\n", + " @api_method\n", + " def inspect_tensor(\n", + " self,\n", + " config: Dict,\n", + " layer_name: str,\n", + " tensor_name: str,\n", + " iteration: int,\n", + " tp_group: torch.distributed.ProcessGroup,\n", + " tensor: torch.Tensor,\n", + " rowwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,\n", + " columnwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,\n", + " quantizer: Optional[Quantizer] = None,\n", + " ):\n", + " # API call inspect_tensor is used to gather the data about the tensor.\n", + " # All API calls are documented in the `Precision debug tools / API / Calls to Nvidia-DL-Framework-Inspect`\n", + " # section of the documentation.\n", + "\n", + " threshold = config[\"threshold\"]\n", + "\n", + " # Get the reduction group from the debug tool\n", + " # one can 
set it using debug_api.set_tensor_reduction_group(group)\n", + " reduction_group = debug_api.get_tensor_reduction_group()\n", + "\n", + " # Compute percentage on local tensor\n", + " count = (tensor > threshold).sum().float()\n", + " total = torch.tensor(tensor.numel(), dtype=torch.float32, device=tensor.device)\n", + "\n", + " # Perform reduction across the group if needed.\n", + " # Note that we perform all_reduce twice per every tensor, which is suboptimal.\n", + " # For guidance on implementing efficient statistics reduction, see the implementation in the `LogTensorStats` feature.\n", + " # In this tutorial we only showcase basic implementation of the feature.\n", + " if reduction_group is not None:\n", + " torch.distributed.all_reduce(count, group=reduction_group)\n", + " torch.distributed.all_reduce(total, group=reduction_group)\n", + "\n", + " percentage = count / total\n", + "\n", + " # MetricLogger is a class from nvidia-dlframework-inspect.\n", + " # By using it we can also use functionalities provided by nvidia-dlframework-inspect,\n", + " # like logging to TensorBoard, etc.\n", + " MetricLogger.log_scalar(\n", + " f\"{layer_name}_{tensor_name}_percentage_greater_than_threshold\", percentage, iteration\n", + " )\n", + "\n", + " @api_method\n", + " def inspect_tensor_enabled(\n", + " self, config: Dict, layer_name: str, tensor_name: str, iteration: int\n", + " ):\n", + " # This call is used by TE to determine if the unfused debug layer - which is slower - needs to be run.\n", + " # It returns a tuple (bool, int), where the int indicates the next iteration when the feature will be enabled\n", + " # and bool indicates if the feature should be enabled at the current iteration.\n", + "\n", + " run_current = iteration % config[\"freq\"] == 0\n", + " # run in next multiple of freq\n", + " next_iter = iteration + (config[\"freq\"] - iteration % config[\"freq\"])\n", + " return run_current, next_iter" + ] + }, + "execution_count": 1, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Code\n", + "Code(filename='./custom_feature_dir/percentage_greater_than_threshold.py', language='python')" + ] + }, + { + "cell_type": "markdown", + "id": "1fd2c750-a859-49ab-bcdd-6e7bd0d7efbb", + "metadata": {}, + "source": [ + "Let's prepare simple config file `custom_feature_example_config.yaml` and training script." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "279c93c0-6d7e-4c02-a00f-e5b40496b5fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
stats:\n",
+       "  enabled: True\n",
+       "  layers:\n",
+       "    layer_name_regex_pattern: .*\n",
+       "  transformer_engine:\n",
+       "    PercentageGreaterThanThreshold:\n",
+       "      enabled: True\n",
+       "      tensors: [activation]\n",
+       "      threshold: 0.1\n",
+       "      freq: 5\n",
+       "    LogTensorStats:\n",
+       "      enabled: True\n",
+       "      tensors: [activation]\n",
+       "      stats: [min]\n",
+       "      freq: 5\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{n}{stats}\\PY{p}{:}\n", + " \\PY{n}{enabled}\\PY{p}{:} \\PY{k+kc}{True}\n", + " \\PY{n}{layers}\\PY{p}{:}\n", + " \\PY{n}{layer\\PYZus{}name\\PYZus{}regex\\PYZus{}pattern}\\PY{p}{:} \\PY{o}{.}\\PY{o}{*}\n", + " \\PY{n}{transformer\\PYZus{}engine}\\PY{p}{:}\n", + " \\PY{n}{PercentageGreaterThanThreshold}\\PY{p}{:}\n", + " \\PY{n}{enabled}\\PY{p}{:} \\PY{k+kc}{True}\n", + " \\PY{n}{tensors}\\PY{p}{:} \\PY{p}{[}\\PY{n}{activation}\\PY{p}{]}\n", + " \\PY{n}{threshold}\\PY{p}{:} \\PY{l+m+mf}{0.1}\n", + " \\PY{n}{freq}\\PY{p}{:} \\PY{l+m+mi}{5}\n", + " \\PY{n}{LogTensorStats}\\PY{p}{:}\n", + " \\PY{n}{enabled}\\PY{p}{:} \\PY{k+kc}{True}\n", + " \\PY{n}{tensors}\\PY{p}{:} \\PY{p}{[}\\PY{n}{activation}\\PY{p}{]}\n", + " \\PY{n}{stats}\\PY{p}{:} \\PY{p}{[}\\PY{n+nb}{min}\\PY{p}{]}\n", + " \\PY{n}{freq}\\PY{p}{:} \\PY{l+m+mi}{5}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "stats:\n", + " enabled: True\n", + " layers:\n", + " layer_name_regex_pattern: .*\n", + " transformer_engine:\n", + " PercentageGreaterThanThreshold:\n", + " enabled: True\n", + " tensors: [activation]\n", + " threshold: 0.1\n", + " freq: 5\n", + " LogTensorStats:\n", + " enabled: True\n", + " tensors: [activation]\n", + " stats: [min]\n", + " freq: 5" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Code\n", + "Code(filename='./custom_feature_dir/custom_feature_example_config.yaml', language='python')" + ] + }, + { + "cell_type": "markdown", + "id": "3929f293-7ac1-48b0-8a4d-23bb6976aa0b", + "metadata": {}, + "source": [ + "To use this feature one needs to add `.../custom_feature_dir` to `debug_api.initialize(feature_dirs=...`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d82f1c82", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "NVDLFW INSPECT - 2025-10-17 14:16:42,204 - WARNING - Reduction group initialized for tensor reduction before logging statistics. If per-rank statistics are required, pass `skip_reduction=True` when invoking the API. To pass another reduction group, use `reduction_group` kwarg when invoking the API.\n" + ] + } + ], + "source": [ + "import os, time\n", + "import torch\n", + "import transformer_engine.pytorch as te\n", + "import nvdlfw_inspect.api as debug_api\n", + "\n", + "te_dir = os.environ[\"TE_PATH\"] # setup TE dir as environment variable to run this script\n", + "log_dir = os.environ.get(\"LOG_PATH\", \"./log\")\n", + "\n", + "debug_api.initialize(\n", + " config_file=te_dir + \"/docs/debug/custom_feature_dir/custom_feature_example_config.yaml\",\n", + " feature_dirs=[\n", + " te_dir + \"/transformer_engine/debug/features\", \n", + " te_dir + \"/docs/debug/custom_feature_dir\" # One needs to add path to the custom feature dir here\n", + " ],\n", + " log_dir=log_dir,\n", + " default_logging_enabled=True)\n", + "\n", + "debug_api.set_tensor_reduction_group(None) # For distributed training one needs to set the reduction group\n", + "\n", + "module = te.Linear(128, 128, name=\"linear_1\")\n", + "inp = torch.randn(128, 128).cuda()\n", + "\n", + "# Simple training loop with measuring the time\n", + "times = []\n", + "for _ in range(100):\n", + " time_start = time.time()\n", + " inp.normal_()\n", + " out = module(inp)\n", + " out.sum().backward()\n", + " torch.cuda.synchronize()\n", + " time_end = time.time()\n", + " times.append(time_end - time_start)\n", + "\n", + " debug_api.step()" + ] + }, + { + "cell_type": "markdown", + "id": "e4f129a9", + "metadata": {}, + "source": [ + "Now, let's plot the gathered stats." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b68a21ea", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from custom_feature_dir.utils import plot_stats\n", + "\n", + "plot_stats(log_dir)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/debug/custom_feature_dir/custom_feature_example_config.yaml b/docs/debug/custom_feature_dir/custom_feature_example_config.yaml new file mode 100644 index 0000000000..ab0369866f --- /dev/null +++ b/docs/debug/custom_feature_dir/custom_feature_example_config.yaml @@ -0,0 +1,15 @@ +stats: + enabled: True + layers: + layer_name_regex_pattern: .* + transformer_engine: + PercentageGreaterThanThreshold: + enabled: True + tensors: [activation] + threshold: 0.1 + freq: 5 + LogTensorStats: + enabled: True + tensors: [activation] + stats: [min] + freq: 5 \ No newline at end of file diff --git a/docs/debug/custom_feature_dir/percentage_greater_than_threshold.py b/docs/debug/custom_feature_dir/percentage_greater_than_threshold.py new file mode 100644 index 0000000000..ff5524acf3 --- /dev/null +++ b/docs/debug/custom_feature_dir/percentage_greater_than_threshold.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
"""PercentageGreaterThanThreshold Feature support for nvidia-dlframework-inspect"""

from typing import Dict, Optional

import torch

from nvdlfw_inspect.registry import Registry, api_method
from nvdlfw_inspect.logging import MetricLogger
import nvdlfw_inspect.api as debug_api

from transformer_engine.debug.features.api import TEConfigAPIMapper
from transformer_engine.pytorch.tensor import QuantizedTensor, Quantizer


# Class should inherit from TEConfigAPIMapper and be registered to the transformer_engine namespace.
@Registry.register_feature(namespace="transformer_engine")
class PercentageGreaterThanThreshold(TEConfigAPIMapper):
    """Log the fraction of tensor elements that are greater than a threshold.

    Config keys read by this feature:

    * ``threshold`` (required): elements strictly greater than this value are counted.
      Values are compared as-is; no absolute value is taken.
    * ``freq`` (optional, defaults to 1): the feature is enabled on iterations that
      are multiples of ``freq``.

    The logged scalar is ``count / total`` and therefore lies in ``[0, 1]``.
    """

    @api_method
    def inspect_tensor(
        self,
        config: Dict,
        layer_name: str,
        tensor_name: str,
        iteration: int,
        tp_group: torch.distributed.ProcessGroup,
        tensor: torch.Tensor,
        rowwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
        columnwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
        quantizer: Optional[Quantizer] = None,
    ):
        """Compute and log the fraction of elements of ``tensor`` above the threshold.

        API call inspect_tensor is used to gather the data about the tensor.
        All API calls are documented in the
        `Precision debug tools / API / Calls to Nvidia-DL-Framework-Inspect`
        section of the documentation.

        Args:
            config: Feature config; ``config["threshold"]`` must be present.
            layer_name: Layer name, used as a prefix of the logged metric name.
            tensor_name: Tensor name, used in the logged metric name.
            iteration: Iteration number passed to the logger.
            tp_group: Unused by this feature.
            tensor: Tensor whose elements are compared against the threshold.
            rowwise_quantized_tensor: Unused by this feature.
            columnwise_quantized_tensor: Unused by this feature.
            quantizer: Unused by this feature.

        Raises:
            KeyError: If ``threshold`` is missing from ``config``.
        """
        threshold = config["threshold"]

        # Get the reduction group from the debug tool
        # one can set it using debug_api.set_tensor_reduction_group(group)
        reduction_group = debug_api.get_tensor_reduction_group()

        # Local statistics, kept as exact integer counts: [elements above threshold, all elements].
        # Packing both into one tensor lets a single all_reduce handle them together.
        exceeding = (tensor > threshold).sum()
        stats = torch.stack([exceeding, exceeding.new_tensor(tensor.numel())])

        # Sum the counts across the group if needed (one collective for both values).
        # For guidance on implementing efficient statistics reduction for many stats,
        # see the implementation in the `LogTensorStats` feature.
        if reduction_group is not None:
            torch.distributed.all_reduce(stats, group=reduction_group)

        count, total = stats.unbind()
        # Clamp the denominator so that an empty tensor logs 0 instead of NaN (0 / 0).
        percentage = count / total.clamp(min=1)

        # MetricLogger is a class from nvidia-dlframework-inspect.
        # By using it we can also use functionalities provided by nvidia-dlframework-inspect,
        # like logging to TensorBoard, etc.
        MetricLogger.log_scalar(
            f"{layer_name}_{tensor_name}_percentage_greater_than_threshold", percentage, iteration
        )

    @api_method
    def inspect_tensor_enabled(
        self, config: Dict, layer_name: str, tensor_name: str, iteration: int
    ):
        """Tell whether the feature runs at ``iteration`` and when it runs next.

        This call is used by TE to determine if the unfused debug layer - which is
        slower - needs to be run.

        Args:
            config: Feature config; ``freq`` is optional and defaults to 1.
            layer_name: Unused by this feature.
            tensor_name: Unused by this feature.
            iteration: Current iteration number.

        Returns:
            A tuple ``(run_current, next_iter)``: ``run_current`` is True when
            ``iteration`` is a multiple of ``freq``; ``next_iter`` is the next
            multiple of ``freq`` strictly after ``iteration``.

        Raises:
            ValueError: If ``freq`` is not positive.
        """
        freq = config.get("freq", 1)
        if freq <= 0:
            # freq == 0 would divide by zero below; a negative freq would point
            # next_iter at an iteration that has already passed.
            raise ValueError(f"freq must be a positive number, got {freq!r}")

        offset = iteration % freq
        run_current = offset == 0
        # run in next multiple of freq
        next_iter = iteration + (freq - offset)
        return run_current, next_iter
"""Utils for plotting stats in the tutorial"""


import os
import re

import matplotlib.pyplot as plt

# Signed decimal number with an optional fraction and exponent,
# e.g. "-4", "0.25", ".5", "1.5e-05".
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def _last_number(line):
    """Return the last number found in ``line`` as a float, or None if there is none."""
    matches = _NUMBER_PATTERN.findall(line)
    return float(matches[-1]) if matches else None


def _read_stats(stat_file):
    """Collect the logged "min" and "percentage_greater_than_threshold" values.

    A line contributes the last number it contains.
    NOTE(review): this assumes the logged value is the last number on each log
    line - confirm against the nvidia-dlframework-inspect log format.

    Returns:
        A tuple ``(min_values, custom_feature_values)`` of lists of floats, in file order.
    """
    min_values = []
    custom_feature_values = []

    with open(stat_file, "r", encoding="utf-8") as f:
        for line in f:
            tracks_min = "min" in line
            tracks_custom = "percentage_greater_than_threshold" in line
            if not (tracks_min or tracks_custom):
                continue
            value = _last_number(line)
            if value is None:
                continue
            if tracks_min:
                min_values.append(value)
            if tracks_custom:
                custom_feature_values.append(value)

    return min_values, custom_feature_values


def plot_stats(log_dir, threshold=0.1):
    """Plot the stats gathered in the tutorial run, two figures side by side.

    Reads ``<log_dir>/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``
    and plots the "min" values on the left and the
    "percentage_greater_than_threshold" values on the right.

    Args:
        log_dir: Directory that was passed as ``log_dir`` to the debug tool.
        threshold: Threshold value shown in the legend and title of the right
            plot; it only affects the labels, not the plotted data.

    Raises:
        FileNotFoundError: If the statistics log file does not exist.
    """
    stat_file = os.path.join(
        log_dir, "nvdlfw_inspect_statistics_logs", "nvdlfw_inspect_globalrank-0.log"
    )
    min_values, custom_feature_values = _read_stats(stat_file)

    # plot 2 figures side by side
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    axs[0].plot(min_values, label="min")
    axs[0].legend()
    axs[0].set_title("Min values")

    axs[1].plot(custom_feature_values, label=f"percentage_greater_than_threshold_{threshold}")
    axs[1].legend()
    axs[1].set_title(f"Percentage greater than threshold {threshold} values")

    plt.tight_layout()
    plt.show()