intel · etiotto · Jan 29, 2025 · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/test/Triton/Intel/RaiseToBlockPointers/addptr_for_expand_ptr.mlir b/test/Triton/Intel/RaiseToBlockPointers/addptr_for_expand_ptr.mlir
@@ -1,6 +1,4 @@
 // RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
-// XFAIL: *
-// TODO: add support for tt.expand_dims in loops
 
 module {
   tt.func @kernel(
@@ -56,18 +54,13 @@ module {
 }
 
 // CHECK:         tt.func @kernel([[PARAM_0_:%.+]]: !tt.ptr<bf16>) {
-// CHECK-DAG:       [[CST_2_:%.+]] = arith.constant 2 : index
-// CHECK-DAG:       [[CST_3_:%.+]] = arith.constant 3 : index
-// CHECK-DAG:       [[CST_1024_:%.+]] = arith.constant 1024 : index
-// CHECK-DAG:       [[CST_0_:%.+]] = arith.constant 0 : index
-// CHECK-DAG:       [[CST_12_:%.+]] = arith.constant 12 : index
-// CHECK-NOT: separator of consecutive DAGs
-// CHECK-DAG:       [[VAR_0_:%.+]] = scf.for [[VAR_arg1_:%.+]] = [[CST_0_]] to [[CST_12_]] step [[CST_3_]] iter_args([[VAR_arg2_:%.+]] = [[CST_1024_]]) -> (index) {
-// CHECK-DAG:         [[VAR_1_:%.+]] = tts.make_tptr [[PARAM_0_]] to sizes: [256, 256], strides: {{.}}[[CST_2_]], 1], offsets: {{.}}[[VAR_arg2_]], 256], shape: [0, 0], order: [] : <bf16> to tensor<256x256x!tt.ptr<bf16>>
-// CHECK:             [[VAR_2_:%.+]] = "tts.load"([[VAR_1_]]) <{operandSegmentSizes = array<i32: 1, 0, 0>, static_mask_dims = array<i64>}> : (tensor<256x256x!tt.ptr<bf16>>) -> tensor<256x256xbf16>
-// CHECK:             "tts.store"([[VAR_1_]], [[VAR_2_]]) <{static_mask_dims = array<i64>}> : (tensor<256x256x!tt.ptr<bf16>>, tensor<256x256xbf16>) -> ()
-// CHECK:             [[VAR_3_:%.+]] = arith.addi [[VAR_arg2_]], [[CST_3_]] : index
-// CHECK:             scf.yield [[VAR_3_]] : index
+// CHECK-DAG:       [[VAR_0_:%.+]] = tt.splat [[PARAM_0_]] : !tt.ptr<bf16> -> tensor<256x!tt.ptr<bf16>>
+// CHECK-DAG:       [[VAR_1_:%.+]] = tt.make_range {end = 1280 : i32, start = 1024 : i32} : tensor<256xi32>
+// CHECK:           [[VAR_2_:%.+]] = tt.addptr [[VAR_0_]], [[VAR_1_]] : tensor<256x!tt.ptr<bf16>>, tensor<256xi32>
+// CHECK:           [[VAR_3_:%.+]] = scf.for [[VAR_arg1_:%.+]] = {{.*}} iter_args([[VAR_arg2_:%.+]] = [[VAR_2_]]) -> (tensor<256x!tt.ptr<bf16>>) {
+// CHECK-NOT:         tt.make_tensor_ptr
+// CHECK-NOT:         tt.advance
+// CHECK:             scf.yield {{.*}} : tensor<256x!tt.ptr<bf16>>
 // CHECK:           }
 // CHECK:           tt.return
 // CHECK:         }
diff --git a/test/Triton/Intel/RaiseToBlockPointers/tensor_indices_loop_iterarg_with_masks.mlir b/test/Triton/Intel/RaiseToBlockPointers/tensor_indices_loop_iterarg_with_masks.mlir
@@ -1,7 +1,5 @@
 // RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
-// XFAIL: *
 
-// IR from python/examples/test_tensor_index_iterargs.py
 module {
   tt.func public @addptr_with_masks(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: i32) attributes {noinline = false} {
     %cst = arith.constant dense<-1.100000e+01> : tensor<4xf32>
@@ -16,7 +14,9 @@ module {
     %4:2 = scf.for %arg3 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg4 = %0, %arg5 = %0) -> (tensor<4xi32>, tensor<4xi32>)  : i32 {
       %5 = arith.cmpi slt, %arg4, %1 : tensor<4xi32>
       %6 = tt.addptr %2, %arg4 : tensor<4x!tt.ptr<f32>>, tensor<4xi32>
-      %7 = tt.load %6, %5, %cst : tensor<4x!tt.ptr<f32>>
+      // TODO: replace with the following line when masked loads are supported.
+      // %7 = tt.load %6, %5, %cst : tensor<4x!tt.ptr<f32>>
+      %7 = tt.load %6 : tensor<4x!tt.ptr<f32>>
       %8 = tt.addptr %3, %arg5 : tensor<4x!tt.ptr<f32>>, tensor<4xi32>
       tt.store %8, %7 : tensor<4x!tt.ptr<f32>>
       %9 = arith.addi %arg4, %cst_0 : tensor<4xi32>
@@ -28,26 +28,11 @@ module {
 }
 
 // CHECK:         tt.func public @addptr_with_masks([[PARAM_0_:%.+]]: !tt.ptr<f32>, [[PARAM_1_:%.+]]: !tt.ptr<f32>, [[PARAM_2_:%.+]]: i32) attributes {noinline = false} {
-// CHECK-DAG:       [[CST_minus_1_dot_100000_:%.+]] = arith.constant -1.100000e+01 : f32
-// CHECK-DAG:       [[CST_4_:%.+]] = arith.constant 4 : index
-// CHECK-DAG:       [[CST_1_:%.+]] = arith.constant 1 : i32
-// CHECK-DAG:       [[CST_4_1_:%.+]] = arith.constant 4 : i32
-// CHECK-DAG:       [[CST_0_:%.+]] = arith.constant 0 : i32
-// CHECK-DAG:       [[CST_0_1_:%.+]] = arith.constant 0 : index
-// CHECK-DAG:       [[CST_1_1_:%.+]] = arith.constant 1 : index
-// CHECK-NOT: separator of consecutive DAGs
-// CHECK-DAG:       [[VAR_0_:%.+]]:2 = scf.for [[VAR_arg3_:%.+]] = [[CST_0_]] to [[CST_4_1_]] step [[CST_1_]] iter_args([[VAR_arg4_:%.+]] = [[CST_0_1_]], [[VAR_arg5_:%.+]] = [[CST_0_1_]]) -> (index, index)  : i32 {
-// CHECK-DAG:         [[VAR_1_:%.+]] = tts.make_tptr [[PARAM_0_]] to sizes: [4], strides: {{.}}[[CST_1_1_]]{{.}}, offsets: {{.}}[[VAR_arg4_]]{{.}}, shape: [0], order: [] : <f32> to tensor<4x!tt.ptr<f32>>
-// CHECK-DAG:         [[VAR_2_:%.+]] = arith.addi [[VAR_arg4_]], [[CST_4_]] : index
-// CHECK-DAG:         [[VAR_3_:%.+]] = arith.index_cast [[PARAM_2_]] : i32 to index
-// CHECK:             [[VAR_4_:%.+]] = arith.minsi [[VAR_2_]], [[VAR_3_]] : index
-// CHECK:             [[VAR_5_:%.+]] = arith.maxsi [[VAR_4_]], [[VAR_arg4_]] : index
-// CHECK:             [[VAR_6_:%.+]] = arith.subi [[VAR_5_]], [[VAR_arg4_]] : index
-// CHECK-DAG:         [[VAR_7_:%.+]] = "tts.load"([[VAR_1_]], [[VAR_6_]], [[CST_minus_1_dot_100000_]]) <{operandSegmentSizes = array<i32: 1, 1, 1>, static_mask_dims = array<i64: -9223372036854775808>}> : (tensor<4x!tt.ptr<f32>>, index, f32) -> tensor<4xf32>
-// CHECK-DAG:         [[VAR_8_:%.+]] = tts.make_tptr [[PARAM_1_]] to sizes: [4], strides: {{.}}[[CST_1_1_]]{{.}}, offsets: {{.}}[[VAR_arg5_]]{{.}}, shape: [0], order: [] : <f32> to tensor<4x!tt.ptr<f32>>
-// CHECK:             "tts.store"([[VAR_8_]], [[VAR_7_]]) <{static_mask_dims = array<i64>}> : (tensor<4x!tt.ptr<f32>>, tensor<4xf32>) -> ()
-// CHECK:             [[VAR_9_:%.+]] = arith.addi [[VAR_arg5_]], [[CST_4_]] : index
-// CHECK:             scf.yield [[VAR_2_]], [[VAR_9_]] : index, index
+// CHECK-DAG:       [[VAR_0_:%.+]] = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
+// CHECK:           [[VAR_1_:%.+]]:2 = scf.for [[VAR_arg3_:%.+]] = {{.*}} iter_args([[VAR_arg4_:%.+]] = [[VAR_0_]], [[VAR_arg5_:%.+]] = [[VAR_0_]]) -> (tensor<4xi32>, tensor<4xi32>)  : i32 {
+// CHECK-NOT:         tt.make_tensor_ptr
+// CHECK-NOT:         tt.advance
+// CHECK:             scf.yield {{.*}}, {{.*}} : tensor<4xi32>, tensor<4xi32>
 // CHECK:           }
 // CHECK:           tt.return
 // CHECK:         }
diff --git a/...ton/Intel/RaiseToBlockPointers/tensor_indices_loop_iterargs_not_used_ptranalysis_e2e.mlir b/...ton/Intel/RaiseToBlockPointers/tensor_indices_loop_iterargs_not_used_ptranalysis_e2e.mlir
@@ -0,0 +1,31 @@
+// RUN: triton-opt %s -triton-raise-block-pointer -canonicalize | FileCheck %s
+// Loop with two tensor<4xi32> iter_args: %arg2 feeds tt.addptr as the offset, %arg3 only provides the stored data.
+module {
+  tt.func public @test_1(%arg0: !tt.ptr<f32>) attributes {noinline = false} {
+    %c1_i32 = arith.constant 1 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<4> : tensor<4xi32>  // added to both iter_args on every iteration
+    %0 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>  // initial value of both iter_args
+    %1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<4x!tt.ptr<f32>>
+    %2:2 = scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg2 = %0, %arg3 = %0) -> (tensor<4xi32>, tensor<4xi32>)  : i32 {
+      %3 = tt.addptr %1, %arg2 : tensor<4x!tt.ptr<f32>>, tensor<4xi32>  // %arg2 supplies the per-element offsets
+      %4 = arith.sitofp %arg3 : tensor<4xi32> to tensor<4xf32>  // %arg3 is converted to f32 and becomes the stored value
+      tt.store %3, %4 : tensor<4x!tt.ptr<f32>>
+      %5 = arith.addi %arg2, %cst : tensor<4xi32>
+      %6 = arith.addi %arg3, %cst : tensor<4xi32>
+      scf.yield %5, %6 : tensor<4xi32>, tensor<4xi32>
+    }
+    tt.return
+  }
+}
+// Expected output: the loop keeps its tensor<4xi32> iter_args and contains no tt.make_tensor_ptr or tt.advance.
+// CHECK:         tt.func public @test_1([[PARAM_0_:%.+]]: !tt.ptr<f32>) attributes {noinline = false} {
+// CHECK-DAG:       [[VAR_0_:%.+]] = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
+// CHECK:           [[VAR_1_:%.+]]:2 = scf.for [[VAR_arg1_:%.+]] = {{.*}} iter_args([[VAR_arg2_:%.+]] = [[VAR_0_]], [[VAR_arg3_:%.+]] = [[VAR_0_]]) -> (tensor<4xi32>, tensor<4xi32>)  : i32 {
+// CHECK-NOT:         tt.make_tensor_ptr
+// CHECK-NOT:         tt.advance
+// CHECK:             scf.yield {{.*}}, {{.*}} : tensor<4xi32>, tensor<4xi32>
+// CHECK:           }
+// CHECK:           tt.return
+// CHECK:         }
diff --git a/third_party/intel/lib/TritonRaiseBlockPointer/TritonRaiseBlockPointer.cpp b/third_party/intel/lib/TritonRaiseBlockPointer/TritonRaiseBlockPointer.cpp
@@ -110,7 +110,7 @@ Value getFinalValue(Value value) {
   Operation *defOp = value.getDefiningOp();
   if (!defOp) {
     // look init values outside the loop
-    BlockArgument blockArg = dyn_cast<BlockArgument>(value);
+    auto blockArg = cast<BlockArgument>(value);
     Operation *parentOp = blockArg.getOwner()->getParentOp();
     if (scf::ForOp forOp = dyn_cast<scf::ForOp>(parentOp))
       return getFinalValue(forOp.getInitArgs()[blockArg.getArgNumber() - 1]);
@@ -654,7 +654,7 @@ struct TritonRaiseBlockPointer
     Operation *defOp = value.getDefiningOp();
     if (!defOp) {
       // look init values outside the loop
-      BlockArgument blockArg = dyn_cast<BlockArgument>(value);
+      auto blockArg = cast<BlockArgument>(value);
       Operation *parentOp = blockArg.getOwner()->getParentOp();
       scf::ForOp forOp = dyn_cast<scf::ForOp>(parentOp);
       return forOp ? hasExpandOpInDefiningPath(