From d19a6ace4c1ac09ae715dc82086d00bbe6abab52 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 29 Jan 2025 02:23:46 +0000 Subject: [PATCH 1/2] Relax dim constraint in MXFP8 tests Dims are multiples of 32 instead of 128. Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 18 +++---- tests/pytorch/distributed/test_fusible_ops.py | 8 +-- tests/pytorch/test_fusible_ops.py | 50 ++++++++----------- 3 files changed, 33 insertions(+), 43 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 39fbd265e7..492ff15f67 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -79,6 +79,8 @@ def main(argv=None, namespace=None): parser.add_argument("--quantization", type=str, default=None) args = parser.parse_args(argv, namespace) + QUANTIZATION = args.quantization + test_dict = [ test_linear, test_layernorm, @@ -87,14 +89,6 @@ def main(argv=None, namespace=None): test_transformer_layer, ] - # Quantization scheme - QUANTIZATION = args.quantization - if QUANTIZATION == "mxfp8": - global SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE - SEQ_LEN = 64 - BATCH_SIZE = 64 - HIDDEN_SIZE = 256 - for test in test_dict: test() dist.destroy_process_group() @@ -575,7 +569,9 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg """ # Set parameter data type params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = {None: 32, "fp8": 64, "mxfp8": 256}[QUANTIZATION] + FFN_HIDDEN_SIZE = ( + 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 + ) # larger tensors lead to numerical failures with tight atol and rtol # Create models model_single_node = te.LayerNormMLP(HIDDEN_SIZE, FFN_HIDDEN_SIZE, **kwargs) @@ -665,7 +661,9 @@ def test_layernorm_mlp(): @run_distributed_test() def _test_transformer_layer_parallel(sequence_parallel=False, **kwargs): params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = {None: 32, "fp8": 64, "mxfp8": 256}[QUANTIZATION] + FFN_HIDDEN_SIZE = ( + 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 + ) # larger tensors lead to numerical failures with tight atol and rtol model_single_node = te.TransformerLayer( HIDDEN_SIZE, FFN_HIDDEN_SIZE, NR_HEADS, attention_dropout=0, hidden_dropout=0, **kwargs diff --git a/tests/pytorch/distributed/test_fusible_ops.py b/tests/pytorch/distributed/test_fusible_ops.py index 11a7df5852..d5be2a969e 100644 --- a/tests/pytorch/distributed/test_fusible_ops.py +++ b/tests/pytorch/distributed/test_fusible_ops.py @@ -315,8 +315,8 @@ def _test_reduce_scatter( def _test_basic_linear( *, - local_weight_shape: tuple[int, int] = (128, 128), - local_batch_size: int = 128, + local_weight_shape: tuple[int, int] = (16, 16), + local_batch_size: int = 16, dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str] = None, @@ -459,8 +459,8 @@ def _test_basic_linear( def _test_linear( *, bias: bool = True, - local_weight_shape: tuple[int, int] = (128, 128), - local_batch_size: int = 128, + local_weight_shape: tuple[int, int] = (16, 16), + local_batch_size: int = 16, dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str] = None, diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index b2bd623ad8..c35ba71b15 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -64,8 +64,8 @@ def maybe_skip_quantization( if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: 
pytest.skip("FP8 GEMMs require dims that are divisible by 16") elif quantization == "mxfp8": - if math.prod(dims[:-1]) % 128 != 0 or dims[-1] % 128 != 0: - pytest.skip("FP8 GEMMs require dims that are divisible by 128") + if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: + pytest.skip("MXFP8 GEMMs require dims that are divisible by 16") # Check if device is supported if device is not None and torch.device(device).type != "cuda": @@ -368,6 +368,7 @@ def test_fp8_scale_update( def test_dtype_cast( self, *, + size: int = 16, init_dtype: torch.dtype, final_dtype: torch.dtype, device: torch.device = "cuda", @@ -379,11 +380,6 @@ def test_dtype_cast( maybe_skip_quantization(quantization, device=device) with_quantization = quantization is not None - # Data dimensions - size = 16 - if quantization == "mxfp8": - size = 128 - # Random data dtype = torch.float32 if torch.float16 in (init_dtype, final_dtype): @@ -437,6 +433,7 @@ def test_dtype_cast( def test_pyt_autocast( self, *, + size: int = 16, model_dtype: torch.dtype, autocast_dtype: torch.dtype, device: torch.device = "cuda", @@ -450,11 +447,6 @@ def test_pyt_autocast( quantized_compute = quantization is not None maybe_skip_quantization(quantization) - # Data dimensions - size = 16 - if quantization == "mxfp8": - size = 128 - # Construct operation recipe = make_recipe(quantization) with te.fp8_model_init(enabled=quantized_weights, recipe=recipe): @@ -692,7 +684,7 @@ def test_bias( def test_quantize( self, *, - in_shape: Iterable[int] = (128, 128), + in_shape: Iterable[int] = (16, 16), dtype: torch.dtype = torch.bfloat16, device: torch.device = "cuda", quantization: str, @@ -859,8 +851,8 @@ def _test_basic_linear( ) torch.testing.assert_close(dw_test, w_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((128, 128), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (4, 4, 8, -1))) + @pytest.mark.parametrize("weight_shape", ((48, 16), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (2, 2, 4, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("accumulate_into_main_grad", (False, True)) @@ -921,8 +913,8 @@ def test_linear( self, *, bias: bool, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str], @@ -1012,8 +1004,8 @@ def test_linear( db_test = op.bias.grad.to(dtype=torch.float64, device="cpu") torch.testing.assert_close(db_test, b_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((7, 2), (128,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 64, -1))) + @pytest.mark.parametrize("weight_shape", ((7, 2), (16,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1182,8 +1174,8 @@ def test_layer_norm_autocast( torch.testing.assert_close(dw_test, w_ref.grad, **dtype_tols(dtype)) torch.testing.assert_close(db_test, b_ref.grad, **dtype_tols(dtype)) - @pytest.mark.parametrize("weight_shape", ((19,), (128,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 64, -1))) + @pytest.mark.parametrize("weight_shape", ((19,), (64,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) 
@pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1395,7 +1387,7 @@ def test_make_extra_output( torch.testing.assert_close(dx_test, x_ref.grad, **tols) @pytest.mark.parametrize("activation", ("relu", "gelu", "geglu", "reglu", "swiglu")) - @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (128, 1, 128))) + @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (4, 1, 16))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) def test_activation( @@ -1491,7 +1483,7 @@ def test_activation( def test_swiglu( self, *, - out_shape: Iterable[int] = (128, 128), + out_shape: Iterable[int] = (16, 16), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1560,8 +1552,8 @@ def setup_class(cls) -> None: torch.manual_seed(seed) torch.cuda.manual_seed(seed) - @pytest.mark.parametrize("weight_shape", ((128, 128), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (128, -1))) + @pytest.mark.parametrize("weight_shape", ((32, 48), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (4, 2, 10, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("quantized_weight", (False, True)) @@ -1678,8 +1670,8 @@ def test_forward_linear_bias_add( self, *, bias: bool, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1791,8 +1783,8 @@ def test_forward_linear_bias_add( def test_backward_linear_add( self, *, - weight_shape: tuple[int, int] = (128, 128), - in_shape: Iterable[int] = (128, -1), + weight_shape: tuple[int, int] = (16, 16), + in_shape: Iterable[int] = (16, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], From 4129b3765cab6b8add03346a9df434bd0faf1eda Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 31 Jan 2025 01:37:09 +0000 Subject: [PATCH 2/2] Make tensor dims multiples of 32 Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 12 +++---- tests/pytorch/test_fusible_ops.py | 40 +++++++++++------------ 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 492ff15f67..7e3a9bb39b 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -23,8 +23,8 @@ ) from run_layer_with_overlap import _compare_tensors -SEQ_LEN, BATCH_SIZE = 16, 16 -HIDDEN_SIZE = 64 +SEQ_LEN, BATCH_SIZE = 32, 32 +HIDDEN_SIZE = 128 NR_HEADS = 4 WORLD_RANK, WORLD_SIZE = None, None NCCL_WORLD = None @@ -569,9 +569,7 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg """ # Set parameter data type params_dtype = kwargs.get("params_dtype", torch.float32) - FFN_HIDDEN_SIZE = ( - 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 - ) # larger tensors lead to numerical failures with tight atol and rtol + FFN_HIDDEN_SIZE = 128 # Create models model_single_node = te.LayerNormMLP(HIDDEN_SIZE, FFN_HIDDEN_SIZE, **kwargs) @@ -661,9 +659,7 @@ def test_layernorm_mlp(): @run_distributed_test() def _test_transformer_layer_parallel(sequence_parallel=False, **kwargs): params_dtype = kwargs.get("params_dtype", 
torch.float32) - FFN_HIDDEN_SIZE = ( - 64 if QUANTIZATION in ("fp8", "mxfp8") else 32 - ) # larger tensors lead to numerical failures with tight atol and rtol + FFN_HIDDEN_SIZE = 128 model_single_node = te.TransformerLayer( HIDDEN_SIZE, FFN_HIDDEN_SIZE, NR_HEADS, attention_dropout=0, hidden_dropout=0, **kwargs diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index c35ba71b15..4506fb628d 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -64,8 +64,8 @@ def maybe_skip_quantization( if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: pytest.skip("FP8 GEMMs require dims that are divisible by 16") elif quantization == "mxfp8": - if math.prod(dims[:-1]) % 16 != 0 or dims[-1] % 16 != 0: - pytest.skip("MXFP8 GEMMs require dims that are divisible by 16") + if math.prod(dims[:-1]) % 32 != 0 or dims[-1] % 32 != 0: + pytest.skip("MXFP8 GEMMs require dims that are divisible by 32") # Check if device is supported if device is not None and torch.device(device).type != "cuda": @@ -368,7 +368,7 @@ def test_fp8_scale_update( def test_dtype_cast( self, *, - size: int = 16, + size: int = 32, init_dtype: torch.dtype, final_dtype: torch.dtype, device: torch.device = "cuda", @@ -433,7 +433,7 @@ def test_dtype_cast( def test_pyt_autocast( self, *, - size: int = 16, + size: int = 32, model_dtype: torch.dtype, autocast_dtype: torch.dtype, device: torch.device = "cuda", @@ -684,7 +684,7 @@ def test_bias( def test_quantize( self, *, - in_shape: Iterable[int] = (16, 16), + in_shape: Iterable[int] = (32, 32), dtype: torch.dtype = torch.bfloat16, device: torch.device = "cuda", quantization: str, @@ -851,8 +851,8 @@ def _test_basic_linear( ) torch.testing.assert_close(dw_test, w_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((48, 16), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (2, 2, 4, -1))) + @pytest.mark.parametrize("weight_shape", ((64, 32), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (5, 1, -1), (4, 2, 4, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("accumulate_into_main_grad", (False, True)) @@ -913,8 +913,8 @@ def test_linear( self, *, bias: bool, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype = torch.float32, device: torch.device = "cuda", quantization: Optional[str], @@ -1004,8 +1004,8 @@ def test_linear( db_test = op.bias.grad.to(dtype=torch.float64, device="cpu") torch.testing.assert_close(db_test, b_ref.grad, **tols) - @pytest.mark.parametrize("weight_shape", ((7, 2), (16,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) + @pytest.mark.parametrize("weight_shape", ((7, 2), (32,))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 16, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1175,7 +1175,7 @@ def test_layer_norm_autocast( torch.testing.assert_close(db_test, b_ref.grad, **dtype_tols(dtype)) @pytest.mark.parametrize("weight_shape", ((19,), (64,))) - @pytest.mark.parametrize("in_shape", ((-1,), (6, 8, -1))) + @pytest.mark.parametrize("in_shape", ((-1,), (6, 16, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("zero_centered_gamma", (False, True)) 
@pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @@ -1387,7 +1387,7 @@ def test_make_extra_output( torch.testing.assert_close(dx_test, x_ref.grad, **tols) @pytest.mark.parametrize("activation", ("relu", "gelu", "geglu", "reglu", "swiglu")) - @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (4, 1, 16))) + @pytest.mark.parametrize("out_shape", ((37,), (2, 13), (32, 1, 32))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) def test_activation( @@ -1483,7 +1483,7 @@ def test_activation( def test_swiglu( self, *, - out_shape: Iterable[int] = (16, 16), + out_shape: Iterable[int] = (32, 32), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1552,8 +1552,8 @@ def setup_class(cls) -> None: torch.manual_seed(seed) torch.cuda.manual_seed(seed) - @pytest.mark.parametrize("weight_shape", ((32, 48), (3, 5))) - @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (4, 2, 10, -1))) + @pytest.mark.parametrize("weight_shape", ((32, 64), (3, 5))) + @pytest.mark.parametrize("in_shape", ((-1,), (1, 7, -1), (8, 2, 10, -1))) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("quantization", (None, "fp8", "mxfp8")) @pytest.mark.parametrize("quantized_weight", (False, True)) @@ -1670,8 +1670,8 @@ def test_forward_linear_bias_add( self, *, bias: bool, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str], @@ -1783,8 +1783,8 @@ def test_forward_linear_bias_add( def test_backward_linear_add( self, *, - weight_shape: tuple[int, int] = (16, 16), - in_shape: Iterable[int] = (16, -1), + weight_shape: tuple[int, int] = (32, 32), + in_shape: Iterable[int] = (32, -1), dtype: torch.dtype, device: torch.device = "cuda", quantization: Optional[str],