From d668f18f4a2b93c92d9d63f8f6d14ab3f075ec0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com>
Date: Mon, 24 Feb 2025 14:50:49 +0100
Subject: [PATCH 1/2] [Pytorch] Added missing assert_dim_for_fp8_exec for Linear

* fix

Signed-off-by: Pawel Gadzinski

* reshape inp

Signed-off-by: Pawel Gadzinski

---------

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/pytorch/module/linear.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index e51513630f..bae21eebfd 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -27,6 +27,7 @@
     divide,
     init_method_constant,
     non_tn_fp8_gemm_supported,
+    assert_dim_for_fp8_exec,
     nvtx_range_pop,
     nvtx_range_push,
     requires_grad,
@@ -118,13 +119,14 @@ def forward(
         # Prepare input tensor
         # Note: Cast to expected dtype and perform tensor-parallel communication
         nvtx_range_push(f"{nvtx_label}.input_cast_comm")
-        inputmat = inp
+        inputmat = inp.view(-1, in_features)
         inputmat_total = None
         with_input_all_gather_nccl = (
             parallel_mode == "column" and sequence_parallel and not ub_overlap_ag_fprop
         )
         own_quantized_input = False
         if fp8:
+            assert_dim_for_fp8_exec(inputmat, weight)
             if (
                 any([ub_overlap_ag_fprop, ub_overlap_rs_fprop])
                 and not FP8GlobalStateManager.get_fp8_recipe().delayed()

From 229dd04537abebfea998d56fe518a9a9bf70b483 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com>
Date: Mon, 24 Feb 2025 14:57:51 +0100
Subject: [PATCH 2/2] [PyTorch] Run all Python tests, even if one of them fails

* non-exit tests

Signed-off-by: Pawel Gadzinski

* fix

Signed-off-by: Pawel Gadzinski

* fix

Signed-off-by: Pawel Gadzinski

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Pawel Gadzinski
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 qa/L0_pytorch_unittest/test.sh             | 34 ++++++++++++----------
 qa/L1_pytorch_distributed_unittest/test.sh | 17 ++++++-----
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
index dd7f95bce0..6915d618f0 100644
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -2,22 +2,26 @@
 #
 # See LICENSE for license information.
 
-set -e
 
 : ${TE_PATH:=/opt/transformerengine}
 
 pip install pytest==8.2.1
-pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py
-pytest -v -s $TE_PATH/tests/pytorch/test_recipe.py
-pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py
-PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py
-NVTE_CUDNN_MXFP8_NORM=0 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_cuda_graphs.py
-pytest -v -s $TE_PATH/tests/pytorch/test_jit.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py
-pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py
-pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py
-pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py
-pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py
-pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py
-NVTE_TORCH_COMPILE=0 NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 pytest -o log_cli=true --log-cli-level=INFO -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
+
+FAIL=0
+
+pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_recipe.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py || FAIL=1
+PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
+NVTE_CUDNN_MXFP8_NORM=0 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_cuda_graphs.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_jit.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py || FAIL=1
+NVTE_TORCH_COMPILE=0 NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 pytest -o log_cli=true --log-cli-level=INFO -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || FAIL=1
+
+exit $FAIL
diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
index 8ee0be1af5..5e3823d85c 100644
--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -2,14 +2,17 @@
 #
 # See LICENSE for license information.
 
-set -e
-
 : ${TE_PATH:=/opt/transformerengine}
 
 pip install pytest==8.2.1
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py
-pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
+
+FAIL=0
+
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || FAIL=1
 # pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py ### TODO Debug UB support with te.Sequential
-pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
+pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py || FAIL=1
+
+exit $FAIL