
Commit 538f262

tomnatan30 authored and copybara-github committed
Don't allow sideways propagation between operands/results when the propagation direction is forward/backward.

The rationale is that a forward/backward propagation isn't meant to update the sharding of other operands/results.

PiperOrigin-RevId: 734104394
1 parent 201c8ea commit 538f262

8 files changed: +295 -79 lines changed
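
The rule this commit enforces can be restated compactly. The following is a minimal sketch, not code from the commit: the enum mirrors sdy's PropagationDirection, while mayUpdateOperands/mayUpdateResults are hypothetical helpers introduced only for illustration.

#include <cstdio>
#include <utility>

enum class PropagationDirection { NONE, FORWARD, BACKWARD, BOTH };

// Forward propagation must not update operands sideways.
bool mayUpdateOperands(PropagationDirection d) {
  return d == PropagationDirection::BACKWARD || d == PropagationDirection::BOTH;
}

// Backward propagation must not update results sideways.
bool mayUpdateResults(PropagationDirection d) {
  return d == PropagationDirection::FORWARD || d == PropagationDirection::BOTH;
}

int main() {
  const std::pair<const char*, PropagationDirection> cases[] = {
      {"NONE", PropagationDirection::NONE},
      {"FORWARD", PropagationDirection::FORWARD},
      {"BACKWARD", PropagationDirection::BACKWARD},
      {"BOTH", PropagationDirection::BOTH}};
  for (const auto& [name, d] : cases) {
    std::printf("%-8s may update: operands=%d results=%d\n", name,
                mayUpdateOperands(d), mayUpdateResults(d));
  }
}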

shardy/dialect/sdy/transforms/propagation/aggressive_factor_propagation.cc (+74 -63)

@@ -96,6 +96,36 @@ bool isStrictPrefixOfFactorSharding(
   return false;
 }
 
+// Only propagate axes to operands that are also present in at least one result.
+//
+// We want to avoid the following situation which can happen when a
+// `sharding_constraint` is added onto the operand during Shardy import:
+// ```
+// %arg0: [{"a", ?}]
+// %arg1: [{?}]
+// %0 = add %arg0, %arg1 : [{}]
+// ```
+// We don't want to do an all-gather on both %arg0 and %arg1 due to "a"
+// propagating sideways. Instead with the code below, since "a" can't
+// propagate to `%0`, we will only do an all-gather on %arg0.
+//
+// TODO(b/396642774): Long term we should undo this and allow sideways
+// propagation, but have our explicit reshard pass make sure the result is
+// all-gathered instead of both operands.
+void cancelSidewaysPropagationForElementwise(ShardingProjection& projection,
+                                             int64_t factorIndex,
+                                             SmallVector<AxisRefAttr>& newAxes,
+                                             Operation* op) {
+  if (!op || !isElementwise(op)) {
+    return;
+  }
+  for (const TensorFactorShardings& result : projection.getResults()) {
+    if (isStrictPrefixOfFactorSharding(result, factorIndex, newAxes)) {
+      newAxes = result.factorIndexToSharding.at(factorIndex).axisRefs;
+    }
+  }
+}
+
 }  // namespace
 
 SmallVector<AxisRefAttr>
@@ -114,9 +144,7 @@ AggressiveFactorPropagation::getPropagatedFactorSharding(
   // Resolve conflicts within a factor.
   truncateAxesByRemovingConflicts(
       newAxes,
-      [&, factorIndex = factorIndex,
-       &tensorFactorShardings = tensorFactorShardings](
-          AxisRefAttr axisRef, int64_t prevShardedSize) {
+      [&](AxisRefAttr axisRef, int64_t prevShardedSize) {
        return compatiblePrefixNoConflictsWithinFactor(
            axisRef, tensorFactorShardings.replicatedAxes, factorSharding,
            prevShardedSize, factorSizes[factorIndex], mesh);
@@ -133,7 +161,7 @@
   // checking for conflicts w.r.t. the updated state of this tensor.
   truncateAxesByRemovingConflicts(
       newAxes,
-      [&, factorIndex = factorIndex](AxisRefAttr axisRef, int64_t) {
+      [&](AxisRefAttr axisRef, int64_t) {
        return compatiblePrefixNoConflictsAcrossFactors(
            axisRef, factorIndexToSharding, factorIndex);
      },
@@ -182,71 +210,54 @@ UpdateTensorShardings AggressiveFactorPropagation::propagateFactorShardings(
                     factorToSourceTensor[j].index, j);
   });
 
-  for (const auto& [tensorIndex, tensorFactorShardings] :
-       llvm::enumerate(projection.getResults())) {
-    const FactorIndexToSharding& factorIndexToSharding =
-        tensorFactorShardings.factorIndexToSharding;
-
-    // Propagate the axes got in Step 1, resolving conflicts between factors by
-    // following the order of preference in `sortedFactorIndices`.
-    bool tensorUpdated = false;
-    for (int64_t factorIndex : sortedFactorIndices) {
-      SmallVector<AxisRefAttr> newAxes = getPropagatedFactorSharding(
-          factorIndex, tensorFactorShardings, factorIndexToSharding,
-          axesPerFactor, mesh, conservativePropagation, factorSizes);
-      if (newAxes.empty()) {
-        continue;
+  // Propagate the axes got in Step 1, resolving conflicts between factors by
+  // following the order of preference in `sortedFactorIndices`.
+  for (int64_t factorIndex : sortedFactorIndices) {
+    PropagationDirection direction = directionAlongFactor(factorIndex);
+    // 1. Propagate to results.
+    //
+    // We don't propagate sideways between results in backwards propagation,
+    // so the sharding of the results along this factor shouldn't change.
+    if (direction != PropagationDirection::BACKWARD) {
+      for (const auto& [tensorIndex, tensorFactorShardings] :
+           llvm::enumerate(projection.getResults())) {
+        SmallVector<AxisRefAttr> newAxes = getPropagatedFactorSharding(
+            factorIndex, tensorFactorShardings,
+            tensorFactorShardings.factorIndexToSharding, axesPerFactor, mesh,
+            conservativePropagation, factorSizes);
+        if (newAxes.empty()) {
+          continue;
+        }
+        if (expandTensorSharding(projection,
+                                 tensorIndex + projection.getNumOperands(),
+                                 factorIndex, newAxes)) {
+          result.updateResults.set(tensorIndex);
+        }
       }
-      tensorUpdated |= expandTensorSharding(
-          projection, tensorIndex + projection.getNumOperands(), factorIndex,
-          newAxes);
     }
-    result.updateResults[tensorIndex] = tensorUpdated;
-  }
-
-  for (const auto& [tensorIndex, tensorFactorShardings] :
-       llvm::enumerate(projection.getOperands())) {
-    const FactorIndexToSharding& factorIndexToSharding =
-        tensorFactorShardings.factorIndexToSharding;
-
-    // Propagate the axes got in Step 1, resolving conflicts between factors by
-    // following the order of preference in `sortedFactorIndices`.
-    bool tensorUpdated = false;
-    for (int64_t factorIndex : sortedFactorIndices) {
-      SmallVector<AxisRefAttr> newAxes = getPropagatedFactorSharding(
-          factorIndex, tensorFactorShardings, factorIndexToSharding,
-          axesPerFactor, mesh, conservativePropagation, factorSizes);
-      if (newAxes.empty()) {
-        continue;
-      }
 
-      // Only propagate sideways through operands the factors that are also
-      // used in at least one result We want to avoid the following situation
-      // which can happen when a `sharding_constraint` is added onto the operand
-      // during Shardy import:
-      // ```
-      // %arg0: [{"a", ?}]
-      // %arg1: [{?}]
-      // %0 = add %arg0, %arg1 : [{}]
-      // ```
-      // We don't want to do an all-gather on both %arg0 and %arg1 due to "a"
-      // propagating sideways. Instead with the code below, since "a" can't
-      // propagate to `%0`, we will only do an all-gather on %arg0.
-      //
-      // TODO(b/396642774): Long term we should undo this and allow sideways
-      // propagation, but have our explicit reshard pass make sure the result is
-      // all-gathered instead of both operands.
-      if (op && isElementwise(op)) {
-        for (const TensorFactorShardings& result : projection.getResults()) {
-          if (isStrictPrefixOfFactorSharding(result, factorIndex, newAxes)) {
-            newAxes = result.factorIndexToSharding.at(factorIndex).axisRefs;
-          }
+    // 2. Propagate to operands.
+    //
+    // We don't propagate sideways between operands in forward propagation,
+    // so the sharding of the operands along this factor shouldn't change.
+    if (direction != PropagationDirection::FORWARD) {
+      for (const auto& [tensorIndex, tensorFactorShardings] :
+           llvm::enumerate(projection.getOperands())) {
+        SmallVector<AxisRefAttr> newAxes = getPropagatedFactorSharding(
+            factorIndex, tensorFactorShardings,
+            tensorFactorShardings.factorIndexToSharding, axesPerFactor, mesh,
+            conservativePropagation, factorSizes);
+        if (newAxes.empty()) {
+          continue;
+        }
+        cancelSidewaysPropagationForElementwise(projection, factorIndex,
+                                                newAxes, op);
+        if (expandTensorSharding(projection, tensorIndex, factorIndex,
+                                 newAxes)) {
+          result.updateOperands.set(tensorIndex);
        }
      }
-      tensorUpdated |=
-          expandTensorSharding(projection, tensorIndex, factorIndex, newAxes);
    }
-    result.updateOperands[tensorIndex] = tensorUpdated;
  }
  return result;
}
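
To see what cancelSidewaysPropagationForElementwise buys us, here is a standalone sketch of the same clamping logic over plain strings; the types and helper names are simplified stand-ins, not the Shardy API. If some result holds only a strict prefix of the axes about to reach an operand, the axes are clamped to that prefix, so an axis that cannot reach the results never travels sideways to a sibling operand.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// A factor's sharding, reduced to a list of axis names.
using Axes = std::vector<std::string>;

// True if `prefix` is a strict prefix of `axes`.
bool isStrictPrefix(const Axes& prefix, const Axes& axes) {
  return prefix.size() < axes.size() &&
         std::equal(prefix.begin(), prefix.end(), axes.begin());
}

// Clamp `newAxes` to the sharding of any result that holds only a strict
// prefix of it, mirroring the helper added in the diff above.
void clampToResults(const std::vector<Axes>& resultAxes, Axes& newAxes) {
  for (const Axes& result : resultAxes) {
    if (isStrictPrefix(result, newAxes)) newAxes = result;
  }
}

int main() {
  // The situation from the comment above: %arg0 is sharded on "a", while
  // %arg1 and the result %0 are unsharded along the factor.
  Axes newAxes = {"a"};              // axes arriving from %arg0
  std::vector<Axes> results = {{}};  // %0 holds no axes for this factor
  clampToResults(results, newAxes);
  assert(newAxes.empty());  // "a" is dropped; %arg1 stays unsharded
  return 0;
}

With the clamp in place, only %arg0 needs an all-gather; without it, "a" would leak sideways to %arg1 and force an all-gather on both operands.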

shardy/dialect/sdy/transforms/propagation/aggressive_factor_propagation_test.cc (+97)

@@ -397,6 +397,103 @@ TEST_F(AggressiveFactorPropagationTest, PropagateAlongSpecificFactor) {
   propagateAlongFactor(propagateAnything(), propagateAlongFactor0Expected);
 }
 
+// NOTE: This test is the same as the one in basic_factor_propagation_test.cc,
+// and verifies that we get the expected behavior in both strategies.
+TEST_F(AggressiveFactorPropagationTest,
+       DifferentDirectionsForDifferentFactors) {
+  ShardingProjection projection(
+      /*operands=*/
+      {{.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
+                                  {1, {.axisRefs = {createAxis("b")}}},
+                                  {2, {.axisRefs = {createAxis("c")}}},
+                                  {3, {.axisRefs = {createAxis("d")}}},
+                                  {4, {.axisRefs = {}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {}}},
+                                  {7, {.axisRefs = {}}}}},
+       {.factorIndexToSharding = {{0, {.axisRefs = {}}},
+                                  {1, {.axisRefs = {}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {}}},
+                                  {7, {.axisRefs = {}}}}}},
+      /*results=*/
+      {{.factorIndexToSharding = {{0, {.axisRefs = {}}},
+                                  {1, {.axisRefs = {}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {createAxis("e")}}},
+                                  {5, {.axisRefs = {createAxis("f")}}},
+                                  {6, {.axisRefs = {}}},
+                                  {7, {.axisRefs = {createAxis("h")}}}}},
+       {.factorIndexToSharding = {{0, {.axisRefs = {}}},
+                                  {1, {.axisRefs = {}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {7, {.axisRefs = {}}}}}});
+
+  PropagationDirectionAlongFactor directionAlongFactor =
+      [](int64_t factorIndex) {
+        if (factorIndex == 0 || factorIndex == 4) {
+          return PropagationDirection::BOTH;
+        }
+        if (factorIndex == 1 || factorIndex == 5) {
+          return PropagationDirection::FORWARD;
+        }
+        if (factorIndex == 2 || factorIndex == 6) {
+          return PropagationDirection::BACKWARD;
+        }
+        return PropagationDirection::NONE;
+      };
+
+  ShardingProjection projectionExpected(
+      /*operands=*/
+      {{.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
+                                  {1, {.axisRefs = {createAxis("b")}}},
+                                  {2, {.axisRefs = {createAxis("c")}}},
+                                  {3, {.axisRefs = {createAxis("d")}}},
+                                  {4, {.axisRefs = {createAxis("e")}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {7, {.axisRefs = {}}}}},
+       {.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
+                                  {1, {.axisRefs = {}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {createAxis("e")}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {7, {.axisRefs = {}}}}}},
+      /*results=*/
+      {{.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
+                                  {1, {.axisRefs = {createAxis("b")}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {createAxis("e")}}},
+                                  {5, {.axisRefs = {createAxis("f")}}},
+                                  {6, {.axisRefs = {}}},
+                                  {7, {.axisRefs = {createAxis("h")}}}}},
+       {.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
+                                  {1, {.axisRefs = {createAxis("b")}}},
+                                  {2, {.axisRefs = {}}},
+                                  {3, {.axisRefs = {}}},
+                                  {4, {.axisRefs = {createAxis("e")}}},
+                                  {5, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {7, {.axisRefs = {}}}}}});
+
+  auto [updateOperands, updateResults] =
+      propagateFactorShardings(projection, 8, directionAlongFactor);
+  EXPECT_THAT(toSetBitsVector(updateOperands), ElementsAre(0, 1));
+  EXPECT_THAT(toSetBitsVector(updateResults), ElementsAre(0, 1));
+  EXPECT_EQ(projection, projectionExpected);
+}
+
 // NOLINTEND(clang-diagnostic-pre-c++20-compat-pedantic)
 
 }  // namespace

shardy/dialect/sdy/transforms/propagation/basic_factor_propagation.cc (+4 -3)

@@ -398,16 +398,17 @@ UpdateTensorShardings BasicFactorPropagation::propagateFactorShardings(
 
   // We propagate each factor separately.
   for (auto [factorIndex, factorSize] : llvm::enumerate(factorSizes)) {
+    PropagationDirection direction = directionAlongFactor(factorIndex);
     // For each factor, find the compatible major sharding axes that can shard
     // that factor for all tensors, those are the axes we will propagate to
     // tensors that aren't already sharded.
     SmallVector<AxisRefAttr> axesToPropagate = getCompatibleMajorShardingAxes(
-        projection, factorIndex, directionAlongFactor(factorIndex), factorSize,
-        mesh, op, conservativePropagation);
+        projection, factorIndex, direction, factorSize, mesh, op,
+        conservativePropagation);
 
     // Update all shardings along this factor if possible.
     auto [updateOperandForFactor, updateResultForFactor] =
-        projection.expandSharding(factorIndex, axesToPropagate);
+        projection.expandSharding(factorIndex, axesToPropagate, direction);
 
     result.updateOperands |= updateOperandForFactor;
     result.updateResults |= updateResultForFactor;

shardy/dialect/sdy/transforms/propagation/basic_factor_propagation_test.cc (+4 -4)

@@ -513,15 +513,15 @@ TEST_F(BasicFactorPropagationTest, DifferentDirectionsForDifferentFactors) {
                                   {3, {.axisRefs = {}}},
                                   {4, {.axisRefs = {createAxis("e")}}},
                                   {5, {.axisRefs = {createAxis("f")}}},
-                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {6, {.axisRefs = {}}},
                                   {7, {.axisRefs = {createAxis("h")}}}}},
        {.factorIndexToSharding = {{0, {.axisRefs = {}}},
                                   {1, {.axisRefs = {}}},
                                   {2, {.axisRefs = {}}},
                                   {3, {.axisRefs = {}}},
                                   {4, {.axisRefs = {}}},
                                   {5, {.axisRefs = {}}},
-                                  {6, {.axisRefs = {}}},
+                                  {6, {.axisRefs = {createAxis("g")}}},
                                   {7, {.axisRefs = {}}}}}});
 
   PropagationDirectionAlongFactor directionAlongFactor =
@@ -549,7 +549,7 @@ TEST_F(BasicFactorPropagationTest, DifferentDirectionsForDifferentFactors) {
                                   {6, {.axisRefs = {createAxis("g")}}},
                                   {7, {.axisRefs = {}}}}},
        {.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
-                                  {1, {.axisRefs = {createAxis("b")}}},
+                                  {1, {.axisRefs = {}}},
                                   {2, {.axisRefs = {}}},
                                   {3, {.axisRefs = {}}},
                                   {4, {.axisRefs = {createAxis("e")}}},
@@ -563,7 +563,7 @@ TEST_F(BasicFactorPropagationTest, DifferentDirectionsForDifferentFactors) {
                                   {3, {.axisRefs = {}}},
                                   {4, {.axisRefs = {createAxis("e")}}},
                                   {5, {.axisRefs = {createAxis("f")}}},
-                                  {6, {.axisRefs = {createAxis("g")}}},
+                                  {6, {.axisRefs = {}}},
                                   {7, {.axisRefs = {createAxis("h")}}}}},
        {.factorIndexToSharding = {{0, {.axisRefs = {createAxis("a")}}},
                                   {1, {.axisRefs = {createAxis("b")}}},

shardy/dialect/sdy/transforms/propagation/sharding_projection.cc (+16 -5)

@@ -190,13 +190,24 @@ TensorShardingAttr TensorFactorShardings::createTensorShardingAttr(
 }
 
 UpdateTensorShardings ShardingProjection::expandSharding(
-    int64_t factorIndex, ArrayRef<AxisRefAttr> newAxes) {
+    int64_t factorIndex, ArrayRef<AxisRefAttr> newAxes,
+    PropagationDirection direction) {
   UpdateTensorShardings result(getNumOperands(), getNumResults());
-  for (auto [i, tensor] : llvm::enumerate(operands)) {
-    result.updateOperands[i] = tensor.expandShardingAxes(factorIndex, newAxes);
+  if (direction == PropagationDirection::NONE) {
+    return result;
   }
-  for (auto [i, tensor] : llvm::enumerate(results)) {
-    result.updateResults[i] = tensor.expandShardingAxes(factorIndex, newAxes);
+  // We don't propagate sideways between operands in forward propagation.
+  if (direction != PropagationDirection::FORWARD) {
+    for (auto [i, tensor] : llvm::enumerate(operands)) {
+      result.updateOperands[i] =
+          tensor.expandShardingAxes(factorIndex, newAxes);
+    }
+  }
+  // We don't propagate sideways between results in backwards propagation.
+  if (direction != PropagationDirection::BACKWARD) {
+    for (auto [i, tensor] : llvm::enumerate(results)) {
+      result.updateResults[i] = tensor.expandShardingAxes(factorIndex, newAxes);
+    }
   }
   return result;
 }

shardy/dialect/sdy/transforms/propagation/sharding_projection.h (+8 -1)

@@ -208,8 +208,15 @@ class ShardingProjection {
   // Expands the shardings of all tensors that are associated with
   // `factorIndex` to be `newAxes` for that factor. Returns two BitVectors
   // indicating whether the operands and results have been expanded.
+  //
+  // If direction is:
+  // - BOTH, both operands and results can be updated.
+  // - FORWARD, only results can be updated.
+  // - BACKWARD, only operands can be updated.
+  // - NONE, no tensors are updated.
   UpdateTensorShardings expandSharding(int64_t factorIndex,
-                                       ArrayRef<AxisRefAttr> newAxes);
+                                       ArrayRef<AxisRefAttr> newAxes,
+                                       PropagationDirection direction);
 
   // Updates the shardings of all tensors that are associated with
   // `factorIndex` to be `newAxes` and `newOverflowAxes` for that factor. Keep
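
Putting the new overload's contract together, the direction gating can be demonstrated end to end with a toy projection. Everything below is a simplified stand-in (plain ints and strings instead of UpdateTensorShardings and AxisRefAttr) that mirrors the control flow documented above, not the real Shardy API.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

enum class PropagationDirection { NONE, FORWARD, BACKWARD, BOTH };

// Toy tensor: its sharding along a single factor, as axis names.
struct Tensor {
  std::vector<std::string> axes;
  // Grows the sharding if `newAxes` extends it (stand-in for
  // expandShardingAxes); returns true if the sharding changed.
  bool expand(const std::vector<std::string>& newAxes) {
    if (newAxes.size() <= axes.size()) return false;
    axes = newAxes;
    return true;
  }
};

struct Projection {
  std::vector<Tensor> operands, results;

  // NONE updates nothing, FORWARD skips operands, BACKWARD skips results.
  std::pair<int, int> expandSharding(const std::vector<std::string>& newAxes,
                                     PropagationDirection direction) {
    int operandUpdates = 0, resultUpdates = 0;
    if (direction == PropagationDirection::NONE) return {0, 0};
    if (direction != PropagationDirection::FORWARD)
      for (Tensor& t : operands) operandUpdates += t.expand(newAxes);
    if (direction != PropagationDirection::BACKWARD)
      for (Tensor& t : results) resultUpdates += t.expand(newAxes);
    return {operandUpdates, resultUpdates};
  }
};

int main() {
  Projection p{/*operands=*/{Tensor{{"a"}}, Tensor{}}, /*results=*/{Tensor{}}};
  // FORWARD: the result picks up "a", but the unsharded second operand is
  // left alone; no sideways propagation between operands.
  auto [ops, res] = p.expandSharding({"a"}, PropagationDirection::FORWARD);
  std::printf("operands updated: %d, results updated: %d\n", ops, res);
}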
