
Commit c7c5d33

ZixuanJiang authored and copybara-github committed
Improve the aggressive factor propagation strategy in Shardy. There are two main differences from `BasicFactorPropagation`.

### Difference 1

`BasicFactorPropagation` propagates the same sharding axes to all the tensors along a factor. This strategy can propagate different sharding axes to different tensors. For example, Tensors T0, T1, T2 contain Factor F0. T0/F0 is already sharded along ["a", "b"], and "b" is already used by T2 ("b" can be explicitly replicated, or it is used to shard another factor). `BasicFactorPropagation` can only propagate ["a"] to both T1/F0 and T2/F0, while this strategy can propagate ["a", "b"] to T1/F0 and ["a"] to T2/F0, respectively.

### Difference 2

`BasicFactorPropagation` is conservative in terms of conflicts across factors. If an axis (or sub-axis) appears in two factor shardings, it cannot be propagated. This strategy is more aggressive, allowing the same axis (or sub-axis) to be propagated along different factors as long as the resulting shardings are legal (a tensor can be sharded by an axis at most once).

PiperOrigin-RevId: 655675663
1 parent d6fa764 commit c7c5d33

12 files changed: +314 −376 lines
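To make Difference 1 concrete before the diffs, here is a minimal standalone C++ sketch of the per-tensor truncation idea. It is not Shardy's API: `truncateForTensor`, the string-valued axis names, and the `usedAxes` sets are hypothetical stand-ins for `AxisRefAttr` and the projection's bookkeeping.

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Hypothetical helper: keep the longest prefix of the factor's compatible
// axes that the destination tensor has not already used elsewhere.
std::vector<std::string> truncateForTensor(
    const std::vector<std::string>& compatibleAxes,
    const std::set<std::string>& usedAxes) {
  std::vector<std::string> kept;
  for (const std::string& axis : compatibleAxes) {
    if (usedAxes.count(axis)) break;  // stop at the first conflicting axis
    kept.push_back(axis);
  }
  return kept;
}

int main() {
  // T0/F0 is sharded along ["a", "b"]; T2 already uses "b" elsewhere.
  const std::vector<std::string> compatible = {"a", "b"};
  const std::set<std::string> usedByT1;
  const std::set<std::string> usedByT2 = {"b"};
  std::cout << "T1/F0 gets " << truncateForTensor(compatible, usedByT1).size()
            << " axes\n";  // 2: ["a", "b"]
  std::cout << "T2/F0 gets " << truncateForTensor(compatible, usedByT2).size()
            << " axes\n";  // 1: ["a"]
}
```

Each destination tensor is truncated independently, which is exactly what lets T1 receive more axes than T2.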

shardy/dialect/sdy/transforms/propagation/BUILD

+2 −1

```diff
@@ -250,9 +250,10 @@ cc_test(
     srcs = ["aggressive_factor_propagation_test.cc"],
     deps = [
         ":aggressive_factor_propagation",
-        ":basic_factor_propagation",
+        ":factor_propagation",
         ":sharding_projection",
         ":testing_utils",
+        ":utils",
         "//shardy/dialect/sdy/ir:dialect",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//llvm:Support",
```

shardy/dialect/sdy/transforms/propagation/aggressive_factor_propagation.cc

+52 −106

```diff
@@ -23,146 +23,92 @@ limitations under the License.
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
 #include "shardy/dialect/sdy/ir/dialect.h"
-#include "shardy/dialect/sdy/transforms/propagation/basic_factor_propagation.h"
 #include "shardy/dialect/sdy/transforms/propagation/factor_propagation.h"
 #include "shardy/dialect/sdy/transforms/propagation/sharding_projection.h"
 
 namespace mlir {
 namespace sdy {
 
-AxesPerFactor
-AggressiveFactorPropagation::getCompatibleMajorShardingAxesForAllFactors(
-    const ShardingProjection& projection, PropagationDirection direction,
+namespace {
+
+bool updateTensorSharding(ShardingProjection& projection, int64_t tensorIndex,
+                          int64_t factorIndex, ArrayRef<AxisRefAttr> newAxes) {
+  if (tensorIndex < projection.getNumOperands()) {
+    return projection.updateOperandSharding(tensorIndex, factorIndex, newAxes);
+  }
+  return projection.updateResultSharding(
+      tensorIndex - projection.getNumOperands(), factorIndex, newAxes);
+}
+
+}  // namespace
+
+UpdateTensorShardings AggressiveFactorPropagation::propagateFactorShardings(
+    ShardingProjection& projection, PropagationDirection direction,
     ArrayRef<int64_t> factorSizes, MeshAttr mesh, Operation* op,
     bool conservativePropagation) const {
+  UpdateTensorShardings result{
+      .updateOperands = BitVector(projection.getNumOperands()),
+      .updateResults = BitVector(projection.getNumResults())};
   if (direction == PropagationDirection::NONE) {
-    return AxesPerFactor(factorSizes.size());
+    return result;
   }
 
-  // Finds the compatible major axes ignoring conflicts.
-  AxesPerFactor result;
-  result.reserve(factorSizes.size());
+  // Step 1. Find the compatible major axes ignoring conflicts.
+  SmallVector<SmallVector<AxisRefAttr>> axesPerFactor;
+  axesPerFactor.reserve(factorSizes.size());
   for (int64_t i = 0; i < factorSizes.size(); ++i) {
-    result.push_back(getCompatibleMajorAxes(projection, i, direction, op));
+    axesPerFactor.push_back(
+        getCompatibleMajorAxes(projection, i, direction, op));
   }
 
-  // Removes the conflicts within every single factor. This strategy and
-  // `BasicFactorPropagation` handles conflicts within a factor in the same way.
+  // Step 2. Propagate the axes obtained in Step 1, considering conflicts
+  // within the factor, ignoring the conflicts between factors.
+  SmallVector<FactorIndexToSharding> newShardings;
+  newShardings.reserve(projection.getNumOperandsAndResults());
   for (const TensorFactorShardings& tensorFactorShardings :
        llvm::concat<const TensorFactorShardings>(projection.getOperands(),
                                                  projection.getResults())) {
+    newShardings.push_back(tensorFactorShardings.factorIndexToSharding);
     for (const auto& [factorIndex, factorSharding] :
          tensorFactorShardings.factorIndexToSharding) {
-      truncateAxesByRemovingConflicts(
-          result[factorIndex],
+      if (!shouldUpdate(factorSharding.axisRefs, axesPerFactor[factorIndex])) {
+        continue;
+      }
+      SmallVector<AxisRefAttr> newAxes = truncateAxesByRemovingConflicts(
+          axesPerFactor[factorIndex],
           [&, factorIndex = factorIndex, &factorSharding = factorSharding](
               AxisRefAttr axisRef, int64_t shardedSize) {
             return compatiblePrefixNoConflictsWithinFactor(
                 axisRef, tensorFactorShardings.replicatedAxes, factorSharding,
                 shardedSize, factorSizes[factorIndex]);
           },
          mesh, conservativePropagation);
+      newShardings.back()[factorIndex].axisRefs = newAxes;
     }
   }
 
-  // Removes the conflicts across factors, where this strategy and
-  // `BasicFactorPropagation` diverge.
-  //
-  // With `BasicFactorPropagation`, the compatible axes of a factor Fi cannot
-  // overlap with the existing sharding axes or the overflow axes related to all
-  // other factors. This criterion is considered for all tensors, no matter if
-  // Fi is mapped to the tensor or not. The table below shows the criterion:
-  //
-  //                        existing sharding axes & overflow axes  new sharding axes
-  //  factor in tensor      remove overlap                          -
-  //  factor not in tensor  remove overlap                          -
-  //
-  // On the contrary, `AggressiveFactorPropagation` has the following criterion:
-  //
-  //                        existing sharding axes & overflow axes  new sharding axes
-  //  factor in tensor      remove overlap                          remove overlap
-  //  factor not in tensor  -                                       -
-  //
-  // There are two differences:
-  //
-  // 1. `BasicFactorPropagation` removes the overlap between the compatible axes
-  // of a factor Fi with the existing sharding axes and overflow axes in a
-  // tensor Tj even if Fi is not in Tj. `AggressiveFactorPropagation` does not
-  // remove this overlap if Fi is not in Tj. `BasicFactorPropagation` is too
-  // strict, since we cannot propagate sharding axes to Tj along Fi.
-  //
-  // `AggressiveFactorPropagation` cannot handle the following case if we only
-  // have difference #1. `-` means that the factor is not mapped to the tensor.
-  // After removing conflicts within factors, we will propagate "x" to T2 along
-  // F0 and F1 at the same time, which induces a conflict. To resolve this
-  // conflict, we have difference #2.
-  //
-  //       F0    F1
-  //  T0   "x"   -
-  //  T1   -     "x"
-  //  T2   ?     ?
-  //
-  // 2. `AggressiveFactorPropagation` removes the overlap between compatible
-  // axes of a factor Fi with the potential new sharding axes of other factors
-  // in Tj if Fi is in Tj. Thus, it is safe to propagate the axes to Tj along Fi
-  // without conflicts with other factors. In the example, we will not propagate
-  // "x" along F0 or F1 since their potential new sharding axes overlap.
-  //
-  // The potential new sharding axes are saved in `resultSnapshot`. It is a hard
-  // copy since we need to handle the following case.
-  //
-  //       F0    F1    F2
-  //  T0   "x"   -     -
-  //  T1   -     "x"   -
-  //  T2   -     -     "x"
-  //  T3   ?     ?     ?
-  //
-  // The `result` and `resultSnapshot` is [["x"], ["x"], ["x"]] before removing
-  // conflicts across factors. After removing conflicts between F0/F1 and other
-  // factors, `result` is [[], [], ["x"]]. When we remove conflicts between F2
-  // and other factors, if we use `result` as the potential new sharding axes,
-  // we will not remove "x" for F2 because it is no longer present in 'result'
-  // for F0 and F1. We have to use `resultSnapshot` to save the potential new
-  // sharding axes and remove "x" for F2.
-  const AxesPerFactor resultSnapshot = result;
-  for (const TensorFactorShardings& tensorFactorSharding :
-       llvm::concat<const TensorFactorShardings>(projection.getOperands(),
-                                                 projection.getResults())) {
-    for (const auto& [factorIndex, factorSharding] :
-         tensorFactorSharding.factorIndexToSharding) {
-      truncateAxesByRemovingConflicts(
-          result[factorIndex],
+  // Step 3. Remove conflicts (overlapping sharding axes) between factors in
+  // `newShardings`. Update (1) shardings in `projection` and (2) `result`
+  // based on new sharding axes.
+  for (const auto& [tensorIndex, newSharding] : llvm::enumerate(newShardings)) {
+    bool tensorUpdated = false;
+    for (const auto& [factorIndex, factorSharding] : newSharding) {
+      SmallVector<AxisRefAttr> newAxes = truncateAxesByRemovingConflicts(
+          factorSharding.axisRefs,
           [&, factorIndex = factorIndex](AxisRefAttr axisRef, int64_t) {
             return compatiblePrefixNoConflictsAcrossFactors(
-                axisRef, tensorFactorSharding.factorIndexToSharding,
-                factorIndex, resultSnapshot);
+                axisRef, newSharding, factorIndex);
           },
           mesh, conservativePropagation);
+      tensorUpdated |=
+          updateTensorSharding(projection, tensorIndex, factorIndex, newAxes);
+    }
+    if (tensorIndex < projection.getNumOperands()) {
+      result.updateOperands[tensorIndex] = tensorUpdated;
+    } else {
+      result.updateResults[tensorIndex - projection.getNumOperands()] =
+          tensorUpdated;
     }
-  }
-
-  return result;
-}
-
-UpdateTensorShardings AggressiveFactorPropagation::propagateFactorShardings(
-    ShardingProjection& projection, PropagationDirection direction,
-    ArrayRef<int64_t> factorSizes, MeshAttr mesh, Operation* op,
-    bool conservativePropagation) const {
-  UpdateTensorShardings result{
-      .updateOperands = BitVector(projection.getNumOperands()),
-      .updateResults = BitVector(projection.getNumResults())};
-
-  // We get the compatible major sharding axes for all factors.
-  AxesPerFactor axesPerFactor = getCompatibleMajorShardingAxesForAllFactors(
-      projection, direction, factorSizes, mesh, op, conservativePropagation);
-
-  for (auto [factorIndex, axesToPropagate] : llvm::enumerate(axesPerFactor)) {
-    // Update all shardings along this factor if possible.
-    auto [updateOperandForFactor, updateResultForFactor] =
-        projection.updateSharding(factorIndex, axesToPropagate);
-
-    result.updateOperands |= updateOperandForFactor;
-    result.updateResults |= updateResultForFactor;
   }
 
   return result;
```
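The per-tensor conflict removal in Step 3 can be sketched standalone. The helper below is hypothetical (simplified string axes instead of `AxisRefAttr`, a plain map instead of `FactorIndexToSharding`); it only mirrors the truncation rule: within one tensor, a factor keeps its proposed axes only up to the first axis that another factor of the same tensor also claims.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

using AxesPerFactorMap = std::map<int, std::vector<std::string>>;

// Keep the longest prefix of factor `factorIndex`'s axes that does not
// overlap with any axis claimed by another factor of the same tensor.
std::vector<std::string> removeCrossFactorConflicts(
    const AxesPerFactorMap& tensorSharding, int factorIndex) {
  std::vector<std::string> kept;
  for (const std::string& axis : tensorSharding.at(factorIndex)) {
    bool conflict = false;
    for (const auto& [otherFactor, otherAxes] : tensorSharding) {
      if (otherFactor == factorIndex) continue;
      for (const std::string& other : otherAxes) {
        if (other == axis) conflict = true;
      }
    }
    if (conflict) break;  // truncate at the first overlapping axis
    kept.push_back(axis);
  }
  return kept;
}

int main() {
  // Tensor T2 from the commit message: both F0 and F1 want "x".
  AxesPerFactorMap t2 = {{0, {"x"}}, {1, {"x"}}};
  std::cout << "F0 keeps " << removeCrossFactorConflicts(t2, 0).size()
            << " axes, F1 keeps " << removeCrossFactorConflicts(t2, 1).size()
            << " axes\n";  // prints: F0 keeps 0 axes, F1 keeps 0 axes
}
```

Because both factors of `t2` claim "x", both are truncated to nothing, which matches the real-conflict behavior (Case 2) described in the header below.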

shardy/dialect/sdy/transforms/propagation/aggressive_factor_propagation.h

+32 −22

```diff
@@ -22,40 +22,50 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"
 #include "shardy/dialect/sdy/ir/dialect.h"
 #include "shardy/dialect/sdy/transforms/propagation/basic_factor_propagation.h"
+#include "shardy/dialect/sdy/transforms/propagation/factor_propagation.h"
 #include "shardy/dialect/sdy/transforms/propagation/sharding_projection.h"
 
 namespace mlir {
 namespace sdy {
 
-// An aggressive strategy of propagating sharding axes along factors.
+// An aggressive strategy of propagating sharding axes along factors. There are
+// two main differences from `BasicFactorPropagation`.
 //
-// This strategy is the same as `BasicFactorPropagation` on the conflicts within
-// a factor. They are different on the conflicts across factors.
+// `BasicFactorPropagation` propagates the same sharding axes to all the tensors
+// along a factor. This strategy can propagate different sharding axes to
+// different tensors. For example, Tensors T0, T1, T2 contain Factor F0. T0/F0
+// is already sharded along ["a", "b"], and "b" is already used by T2 ("b" can
+// be explicitly replicated, or it is used to shard another factor).
+// `BasicFactorPropagation` can only propagate ["a"] to both T1/F0 and T2/F0,
+// while this strategy can propagate ["a", "b"] to T1/F0 and ["a"] to T2/F0.
 //
-// `BasicFactorPropagation` considers the conflicts across factors with a strict
-// criterion. The result cannot overlap with the sharded axes or overflow axes
-// related to all other factors. This aggressive strategy ignores "fake
-// conflicts", which are propagation choices that can co-exist. This aggressive
-// strategy ensures that the resultant axes can be propagated to all tensors
-// containing the factor. Several examples of fake conflicts:
+// `BasicFactorPropagation` is conservative in terms of conflicts across
+// factors. If an axis (or sub-axis) appears in two factor shardings, it cannot
+// be propagated. This strategy is more aggressive, allowing the same axis (or
+// sub-axis) to be propagated along different factors if the resulting
+// shardings are legal (a tensor can be sharded by an axis at most once).
 //
-// 1. An axis is in factors Fi and Fj. If it is infeasible to propagate that
-// axis along factor Fi, we may propagate that axis along factor Fj if all the
-// destination tensors have not used that axis.
+// Let us take C = dot(A, B) as an example. F0 is the factor corresponding to a
+// non-contracting dimension of A. F1 corresponds to a non-contracting dimension
+// of B. F2 corresponds to a contracting dimension. "-" means that the tensor
+// does not contain the factor.
 //
-// 2. Two factors Fi and Fj do not co-exist in any tensor, so they never
-// interfere with each other. If Fi and Fj are sharded along the same axis, we
-// can propagate that axis along both factors.
+//      F0    F1    F2
+// A    "a"   -
+// B    -
+// C          "a"   -
+// Case 1. Fake conflict. `BasicFactorPropagation` propagates nothing, while
+// this strategy propagates "a" to B/F1.
 //
-// Although fake conflicts can co-exist without inference, we may still need to
-// all-gather some tensors.
+//      F0    F1    F2
+// A    "a"   -
+// B    -     "a"
+// C                -
+// Case 2. Real conflict. Both `BasicFactorPropagation` and this strategy
+// propagate nothing, since propagating "a" to both C/F0 and C/F1 is illegal
+// ("a" cannot be used twice in C).
 class AggressiveFactorPropagation : public BasicFactorPropagation {
  public:
-  AxesPerFactor getCompatibleMajorShardingAxesForAllFactors(
-      const ShardingProjection& projection, PropagationDirection direction,
-      ArrayRef<int64_t> factorSizes, MeshAttr mesh, Operation* op,
-      bool conservativePropagation) const override;
-
   UpdateTensorShardings propagateFactorShardings(
       ShardingProjection& projection, PropagationDirection direction,
       ArrayRef<int64_t> factorSizes, MeshAttr mesh, Operation* op,
```