openxla
diff --git a/‎BUILD
+910 b/‎BUILD
+910
diff --git a/‎cmake/llvm-hash.txt
+1-1 b/‎cmake/llvm-hash.txt
+1-1
diff --git a/‎include/triton/Dialect/Triton/IR/TritonOps.td
-2 b/‎include/triton/Dialect/Triton/IR/TritonOps.td
-2
diff --git a/‎lib/Analysis/AxisInfo.cpp
+1-1 b/‎lib/Analysis/AxisInfo.cpp
+1-1
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
+4-2 b/‎lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
+4-2
diff --git a/‎lib/Dialect/Triton/IR/Ops.cpp
-23 b/‎lib/Dialect/Triton/IR/Ops.cpp
-23
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp
+5 b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp
+5
diff --git a/‎lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
+24 b/‎lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
+24
diff --git a/‎lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp
+16-1 b/‎lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp
+16-1
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Prefetch.cpp
+24-2 b/‎lib/Dialect/TritonGPU/Transforms/Prefetch.cpp
+24-2
diff --git a/‎python/BUILD
+78 b/‎python/BUILD
+78
diff --git a/‎python/test/regression/BUILD
+26 b/‎python/test/regression/BUILD
+26
@@ -1 +1 @@
-fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73
+bef3b54ea10a564a2de72f658f2efd64f537c079
@@ -108,8 +108,6 @@ def TT_FpToFpOp : TT_Op<"fp_to_fp", [SameOperandsAndResultShape,
     let assemblyFormat = "$src attr-dict  (`,` `rounding` `=` $rounding^)? `:` type($src) `->` type($result)";
 
     let hasVerifier = 1;
-
-    let hasFolder = 1;
 }
 
 //
 
@@ -937,7 +937,7 @@ class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
       // Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n
       lhsDivisibility = 1;
     }
-    return std::max<int64_t>(1, lhsDivisibility / (1 << shift));
+    return std::max<int64_t>(1, lhsDivisibility / (int64_t(1) << shift));
   }
 
   int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
 
@@ -59,7 +59,8 @@ SmallVector<Value> reorderValues(const SmallVector<Value> &values, Type inType,
   auto ouEltTy = ouTensorTy.getElementType();
   if (inBitWidth == ouBitWidth)
     return values;
-  if (inBitWidth == 16 && ouBitWidth == 32) {
+  if ((inBitWidth == 16 && ouBitWidth == 32) ||
+      (inBitWidth == 32 && ouBitWidth == 16)) {
     // Register layout conversion:
     //
     //   [0, 1], [4, 5]  ⟶  [0], [1], [4], [5]
@@ -85,7 +86,8 @@ SmallVector<Value> reorderValues(const SmallVector<Value> &values, Type inType,
     }
     return ret;
   }
-  if (inBitWidth == 8 && ouBitWidth == 16) {
+  if ((inBitWidth == 8 && ouBitWidth == 16) ||
+      (inBitWidth == 16 && ouBitWidth == 8)) {
     // Register layout conversion:
     //
     //   [0, 1, 2, 3], [8, 9, 10, 11]  ⟶  [0, 1], [2, 3], [8, 9], [10, 11]
 
@@ -728,29 +728,6 @@ LogicalResult ReshapeOp::verify() {
 }
 
 //-- FpToFpOp --
-
-// Fold FpToFpOp when the input operand is a constant zero.
-OpFoldResult FpToFpOp::fold(FoldAdaptor adaptor) {
-  auto srcVal = getSrc();
-  auto dstTy = getType();
-
-  const llvm::fltSemantics &semantic =
-      llvm::cast<FloatType>(dstTy.getElementType()).getFloatSemantics();
-
-  if (matchPattern(srcVal, m_PosZeroFloat())) {
-    llvm::APFloat posZero =
-        llvm::APFloat::getZero(semantic, /*negative=*/false);
-    return DenseFPElementsAttr::get(dstTy, posZero);
-  }
-
-  if (matchPattern(srcVal, m_NegZeroFloat())) {
-    llvm::APFloat negZero = llvm::APFloat::getZero(semantic, /*negative=*/true);
-    return DenseFPElementsAttr::get(dstTy, negZero);
-  }
-
-  return {};
-}
-
 LogicalResult FpToFpOp::verify() {
   auto dstType = getType().getElementType();
   auto srcType = getSrc().getType().getElementType();
 
@@ -2801,6 +2801,11 @@ struct CanonicalizeConvertFromAlloc
     auto convert = op.getSrc().getDefiningOp<ConvertLayoutOp>();
     if (!convert)
       return failure();
+    // LocalAllocOp lowering doesn't support going from DotOperandEncoding
+    // to SharedEncoding, so we want to keep this layout conversion.
+    if (mlir::isa<triton::gpu::DotOperandEncodingAttr>(
+            convert.getSrc().getType().getEncoding()))
+      return failure();
     rewriter.replaceOpWithNewOp<triton::gpu::LocalAllocOp>(
         op, op->getResult(0).getType(), convert.getSrc());
     return mlir::success();
 
@@ -162,6 +162,21 @@ static Value getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter,
   auto newType = MemDescType::get(argType.getShape(), argType.getElementType(),
                                   newLayout, SharedMemorySpace);
   rewriter.setInsertionPointAfterValue(arg);
+
+  // LocalAllocOp lowering doesn't support going from DotOperandEncoding
+  // to SharedEncoding.
+  if (auto dotOpEnc = mlir::dyn_cast<DotOperandEncodingAttr>(
+          argType.getEncoding())) {
+    // Create a layout conversion from DotOperandEncoding to BlockedEncoding
+    // then pass it to the LocalAllocOp.
+    auto newArgType = RankedTensorType::get(
+        argType.getShape(), argType.getElementType(), dotOpEnc.getParent());
+    auto dotOperandToBlockedCvt =
+        rewriter.create<ConvertLayoutOp>(arg.getLoc(), newArgType, arg);
+    return rewriter.create<LocalAllocOp>(arg.getLoc(), newType,
+                                              dotOperandToBlockedCvt);
+  }
+
   return rewriter.create<LocalAllocOp>(arg.getLoc(), newType, arg);
 }
 
@@ -171,6 +186,15 @@ class BlockedToMMA : public mlir::OpRewritePattern<DotOp> {
   mutable llvm::DenseMap<Operation *, unsigned> dotOpInstNs;
 
   static bool bwdFilter(Operation *op) {
+    // Dot operand layout assignment to predicates is not currently supported
+    // during lowering from TritonGPU to LLVM in Triton for MMA cases. This
+    // condition limits visibility of the original bit-width so that predicates
+    // are not considered; hence, kwidth can never be 32.
+    if (isa<arith::UIToFPOp>(op)) {
+      Type srcType = getElementTypeOrSelf(op->getOperand(0));
+      if (srcType.isInteger(1))
+        return false;
+    }
     return op->getNumOperands() == 1 &&
            (isa<FpToFpOp, BitcastOp, ConvertLayoutOp>(op) ||
             isPureUnaryInlineAsm(op) ||
 
@@ -111,7 +111,8 @@ class HoistLayoutConversion : public OpRewritePattern<ConvertLayoutOp> {
                                 PatternRewriter &rewriter) const override {
     // Only consider conversions to dot operand.
     auto cvtTy = cast<RankedTensorType>(cvt.getType());
-    if (!isa<DotOperandEncodingAttr>(cvtTy.getEncoding()))
+    auto dotOpEnc = dyn_cast<DotOperandEncodingAttr>(cvtTy.getEncoding());
+    if (!dotOpEnc)
       return failure();
 
     auto src = cvt.getSrc().getDefiningOp();
@@ -126,6 +127,12 @@ class HoistLayoutConversion : public OpRewritePattern<ConvertLayoutOp> {
                 [](Type ty) { return isa<RankedTensorType>(ty); }))
       return failure();
 
+    // Quick workaround for loading issues: when the original-bitwidth
+    // computation fails to detect a mixed-precision dot (hence kWidth = 1),
+    // do not hoist the layout conversion through the type conversion.
+    if (isa<arith::ExtFOp>(src) && dotOpEnc.getKWidth() == 1)
+        return failure();
+
     // Only consider custom conversions or arith ops.
     // TODO(jlebar): Is this too restrictive?
     if (!isa<FpToFpOp, BitcastOp>(src) && !isPureUnaryInlineAsm(src) &&
@@ -138,6 +145,14 @@ class HoistLayoutConversion : public OpRewritePattern<ConvertLayoutOp> {
     if (isa<arith::TruncIOp, arith::TruncFOp, arith::SelectOp>(src))
       return failure();
 
+    // Don't hoist through u1 -> fp casts as they aren't supported in
+    // ElementwiseOpToLLVM::reorderValues().
+    if (isa<arith::UIToFPOp>(src)) {
+      Type srcType = getElementTypeOrSelf(src->getOperand(0));
+      if (srcType.isInteger(1))
+        return failure();
+    }
+
     // Check that the conversion is transitively dependent on a load, and all
     // operations between the load and the conversion are layout preserving.
     //
 
@@ -116,7 +116,7 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue,
   // opIdx: 0 => a, 1 => b
   auto type = cast<triton::MemDescType>(v.getType());
   SmallVector<int64_t> shape{type.getShape().begin(), type.getShape().end()};
-  SmallVector<int64_t> offset{0, 0};
+  SmallVector<int64_t> offset(shape.size(), 0);
   Type elementType = type.getElementType();
 
   // k => (prefetchWidth, k - prefetchWidth)
@@ -140,8 +140,14 @@ Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue,
                                type.getMemorySpace()),
       v, offsetsVal);
 
+  // We need to set kwidth to zero when the parent layout is Blocked;
+  // otherwise the verifier emits a failure. The parent layout is Blocked
+  // only when Tensor Cores are disabled.
+  int kwidth = dyn_cast<triton::gpu::BlockedEncodingAttr>(dotEncoding)
+                   ? 0
+                   : prefetchWidth / 8;
   auto dotOperandEnc = triton::gpu::DotOperandEncodingAttr::get(
-      builder.getContext(), opIdx, dotEncoding, prefetchWidth / 8);
+      builder.getContext(), opIdx, dotEncoding, kwidth);
   Value prefetchSlice = builder.create<triton::gpu::LocalLoadOp>(
       v.getLoc(), RankedTensorType::get(shape, elementType, dotOperandEnc),
       newSmem);
@@ -190,6 +196,22 @@ LogicalResult Prefetcher::initialize() {
         break;
       if (!op->getResult(0).hasOneUse())
         break;
+      // Similar to the issues faced in the HoistLayoutConversion pattern in
+      // OptimizeDotOperands.cpp, we can't propagate through type casts from
+      // predicates, as they aren't supported in Triton when encoded with a
+      // dot_op layout.
+      if (isa<arith::UIToFPOp>(op)) {
+        Type srcType = getElementTypeOrSelf(op->getOperand(0));
+        if (srcType.isInteger(1))
+          break;
+      }
+      // Propagation through ExpandDims is currently not supported. The
+      // propagation blindly replaces the encoding with a dot encoding, but
+      // ExpandDims requires a SliceEncoding. This could be rewritten to
+      // support it, but that does not look trivial, and it currently crashes.
+      if (isa<ExpandDimsOp>(op)) {
+        break;
+      }
       rets.push_back(op->getOperand(0));
       if (auto cvt = dyn_cast<triton::gpu::LocalLoadOp>(op)) {
         foundConvertFromShared = true;
 
@@ -0,0 +1,78 @@
+# NOTE: Do not depend on any targets from this directory,
+# but use //third_party/py/triton instead.
+
+load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
+
+package(
+    default_applicable_licenses = ["//:license"],
+    default_visibility = [
+        "//third_party/py/triton:__pkg__",
+        "@triton//python:__subpackages__",
+    ],
+)
+
+cc_library(
+    name = "passes",
+    hdrs = ["src/passes.h"],
+    includes = ["src"],
+    visibility = ["@triton//third_party:__subpackages__"],
+)
+
+pybind_extension(
+    name = "libtriton",
+    srcs = [
+        "src/interpreter.cc",
+        "src/ir.cc",
+        "src/llvm.cc",
+        "src/main.cc",
+        "src/passes.cc",
+    ],
+    copts = ["-DTRITON_BACKENDS_TUPLE=(nvidia)"],
+    deps = [
+        ":passes",
+        "@llvm-project//llvm:Core",
+        "@llvm-project//llvm:IPO",
+        "@llvm-project//llvm:IRReader",
+        "@llvm-project//llvm:InstCombine",
+        "@llvm-project//llvm:Linker",
+        "@llvm-project//llvm:MC",
+        "@llvm-project//llvm:Passes",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:Target",
+        "@llvm-project//mlir:BuiltinToLLVMIRTranslation",
+        "@llvm-project//mlir:BytecodeWriter",
+        "@llvm-project//mlir:ControlFlowDialect",
+        "@llvm-project//mlir:ConversionPasses",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:IndexDialect",
+        "@llvm-project//mlir:LLVMDialect",
+        "@llvm-project//mlir:LLVMIRTransforms",
+        "@llvm-project//mlir:LLVMToLLVMIRTranslation",
+        "@llvm-project//mlir:NVVMToLLVMIRTranslation",
+        "@llvm-project//mlir:Parser",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:ToLLVMIRTranslation",
+        "@llvm-project//mlir:Transforms",
+        "@llvm-project//mlir:UBDialect",
+        "//:TritonAnalysis",
+        "//:TritonDialects",
+        "//:TritonGPUToLLVM",
+        "//:TritonGPUTransforms",
+        "//:TritonHSACO",
+        "//:TritonLLVMIR",
+        "//:TritonNvidiaGPUTransforms",
+        "//:TritonPTX",
+        "//:TritonToTritonGPU",
+        "//:TritonTools",
+        "//:TritonTransforms",
+        "@triton//third_party/nvidia:triton_nvidia",
+    ],
+)
+
+filegroup(
+    name = "files",
+    srcs = glob(
+        include = ["triton/**/*.py"],
+    ),
+)
@@ -0,0 +1,26 @@
+load("//third_party/py/pytest:pytest_defs.bzl", "pytest_multi_tests")
+
+package(
+    default_applicable_licenses = ["//:license"],
+)
+
+pytest_multi_tests(
+    name = "tests",
+    size = "large",
+    srcs = ["conftest.py"],
+    shard_count = 10,
+    tags = [
+        "config-cuda-only",
+        "requires-gpu-sm80",
+    ],
+    tests = glob(
+        include = ["test_*.py"],
+        exclude = [
+            "test_performance.py",  #TODO(b/321005767): fix failing test
+        ],
+    ),
+    deps = [
+        "//third_party/py/torch:pytorch",
+        "//third_party/py/triton",
+    ],
+)
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73`
	`1`	`+bef3b54ea10a564a2de72f658f2efd64f537c079`
Original file line number	Diff line number	Diff line change
`@@ -108,8 +108,6 @@ def TT_FpToFpOp : TT_Op<"fp_to_fp", [SameOperandsAndResultShape,`
`108`	`108`	let assemblyFormat = "$src attr-dict (`,` `rounding` `=` $rounding^)? `:` type($src) `->` type($result)";
`109`	`109`
`110`	`110`	`let hasVerifier = 1;`
`111`		`-`
`112`		`- let hasFolder = 1;`
`113`	`111`	`}`
`114`	`112`
`115`	`113`	`//`
Original file line number	Diff line number	Diff line change
`@@ -937,7 +937,7 @@ class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {`
`937`	`937`	`// Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n`
`938`	`938`	`lhsDivisibility = 1;`
`939`	`939`	`}`
`940`		`- return std::max<int64_t>(1, lhsDivisibility / (1 << shift));`
	`940`	`+ return std::max<int64_t>(1, lhsDivisibility / (int64_t(1) << shift));`
`941`	`941`	`}`
`942`	`942`
`943`	`943`	`int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,`