
Commit 38483f7

yashk2810 authored and Google-ML-Automation committed
Add conv_general_dilated sharding rule
This rule only works when rhs is fully replicated or rhs's mesh is empty (i.e. rhs is a numpy array or a jnp.array). In that case we simply forward the sharding of lhs to the output, after checking that the sharding evenly divides the output shape.

And since reduce_window is effectively the same situation as the case above (i.e. lhs sharded, rhs fully replicated), do the same in its sharding rule.

Fixes #28090

PiperOrigin-RevId: 748736039
1 parent 0bf3e9b commit 38483f7
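
For context, here is a minimal sketch (not part of this change) of the behavior the new rules enable. It assumes a recent JAX with explicit-mode sharding (jax.make_mesh with axis_types, jax.sharding.use_mesh) and at least two devices; the shapes, mesh size, and axis names are illustrative only.

import jax
import numpy as np
from jax.sharding import AxisType, NamedSharding, PartitionSpec as P

# Hypothetical 2-device mesh whose single axis is Explicit, so shardings are
# carried on avals and propagated by each primitive's sharding rule.
mesh = jax.make_mesh((2,), ('x',), axis_types=(AxisType.Explicit,))

kernel = np.zeros((5, 4, 16))  # WIO kernel; a plain numpy constant, so its mesh is empty

@jax.jit
def f(x):
  # kernel has an empty mesh and x's feature ('C') axis is unsharded, so the
  # new conv rule forwards x's sharding to the output.
  out = jax.lax.conv_general_dilated(
      x, kernel, window_strides=(1,), padding='SAME',
      dimension_numbers=('NWC', 'WIO', 'NWC'))
  # reduce_window is the same situation, so max pooling over the unsharded W
  # axis also forwards the sharding.
  return jax.lax.reduce_window(
      out, -np.inf, jax.lax.max, (1, 2, 1), (1, 2, 1), 'SAME')

with jax.sharding.use_mesh(mesh):
  x = jax.device_put(np.zeros((8, 128, 4)),  # NWC input, batch sharded on 'x'
                     NamedSharding(mesh, P('x', None, None)))
  print(f(x).sharding)  # expected: NamedSharding(mesh, P('x', None, None))

If the kernel itself were sharded on a non-empty mesh, or the feature ('C') axis of x were sharded, the rule raises core.ShardingTypeError instead, which is the error case exercised in the added test.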

File tree

3 files changed: +106 -31 lines changed


jax/_src/lax/convolution.py

+35 -15
@@ -53,6 +53,8 @@ class ConvDimensionNumbers(NamedTuple):
     None,
 ]
 
+# TODO(yashkatariya): conv_general_dilated should take `out_sharding` argument
+# similar to `dot_general`
 def conv_general_dilated(
     lhs: Array, rhs: Array, window_strides: Sequence[int],
     padding: str | Sequence[tuple[int, int]],
@@ -415,6 +417,26 @@ def _conv_general_dilated_shape_rule(
   return tuple(np.take(out_trans, np.argsort(out_perm)))
 
 
+def _conv_general_dilated_sharding_rule(
+    lhs: core.ShapedArray, rhs: core.ShapedArray, *, window_strides, padding,
+    lhs_dilation, rhs_dilation, dimension_numbers, feature_group_count,
+    batch_group_count, **unused_kwargs):
+  # Only allow if rhs is fully replicated and lhs's feature dim is not sharded
+  if ((rhs.sharding.mesh.empty or rhs.sharding.is_fully_replicated) and
+      lhs.sharding.spec[dimension_numbers.lhs_spec[1]] is None):
+    out_shape = _conv_general_dilated_shape_rule(
+        lhs, rhs, window_strides=window_strides, padding=padding,
+        lhs_dilation=lhs_dilation, rhs_dilation=rhs_dilation,
+        dimension_numbers=dimension_numbers,
+        feature_group_count=feature_group_count,
+        batch_group_count=batch_group_count)
+    return lax.slicing._get_sharding_for_varying_out_shape(
+        out_shape, lhs, "conv_general_dilated")
+  # TODO(yashkatariya): In this case, just let the user specify the out_sharding
+  # via `out_sharding` argument to `conv_general_dilated`.
+  raise core.ShardingTypeError(
+      "Please file an issue at https://github.com/jax-ml/jax/issues")
+
 def _conv_general_dilated_dtype_rule(
     lhs, rhs, *, window_strides, padding, lhs_dilation, rhs_dilation,
     dimension_numbers, preferred_element_type, **unused_kwargs):
@@ -635,6 +657,7 @@ def _conv_general_dilated_batch_rule(
 conv_general_dilated_p = lax.standard_primitive(
     _conv_general_dilated_shape_rule, _conv_general_dilated_dtype_rule,
     'conv_general_dilated',
+    sharding_rule=_conv_general_dilated_sharding_rule,
     vma_rule=partial(core.standard_vma_rule, 'conv_general_dilated'))
 
 ad.defbilinear(conv_general_dilated_p,
@@ -713,21 +736,18 @@ def _conv_general_dilated_lower(
     # TODO(https://github.com/openxla/stablehlo/issues/1268)
     raise NotImplementedError("Convolutions with non-static strides, dilation, feature_group_count, or batch_group_count")
   if all(core.is_constant_shape(p) for p in padding):
-    return [
-        hlo.convolution(
-            mlir.aval_to_ir_type(aval_out),
-            lhs,
-            rhs,
-            dimension_numbers=dnums,
-            feature_group_count=mlir.i64_attr(feature_group_count),
-            batch_group_count=mlir.i64_attr(batch_group_count),
-            window_strides=mlir.dense_int_array(window_strides),
-            padding=mlir.dense_int_elements(padding),
-            lhs_dilation=mlir.dense_int_array(lhs_dilation),
-            rhs_dilation=mlir.dense_int_array(rhs_dilation),
-            window_reversal=window_reversal,
-            precision_config=lax.precision_attr(precision))
-    ]
+    out = hlo.convolution(
+        mlir.aval_to_ir_type(aval_out), lhs, rhs,
+        dimension_numbers=dnums,
+        feature_group_count=mlir.i64_attr(feature_group_count),
+        batch_group_count=mlir.i64_attr(batch_group_count),
+        window_strides=mlir.dense_int_array(window_strides),
+        padding=mlir.dense_int_elements(padding),
+        lhs_dilation=mlir.dense_int_array(lhs_dilation),
+        rhs_dilation=mlir.dense_int_array(rhs_dilation),
+        window_reversal=window_reversal,
+        precision_config=lax.precision_attr(precision))
+    return [mlir.lower_with_sharding_in_types(ctx, out, aval_out)]
   else:
     # d_padding will be an array i32[N, 2] with pad_lo and pad_hi for each
     # spatial dimension.
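
An aside, not part of the commit: the feature-axis check in the new sharding rule indexes dimension_numbers.lhs_spec[1], which is the position of the input-feature ('C') dimension in the lhs layout. A quick illustration using the public jax.lax.conv_dimension_numbers helper (shapes here are made up):

from jax import lax

# For an 'NWC' lhs layout, lhs_spec is (batch, feature, spatial) = (0, 2, 1),
# so lhs_spec[1] == 2 selects the 'C' axis that the rule requires to be unsharded.
dnums = lax.conv_dimension_numbers((8, 128, 4), (5, 4, 16), ('NWC', 'WIO', 'NWC'))
print(dnums.lhs_spec)  # (0, 2, 1)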

jax/_src/lax/windowed_reductions.py

+19 -16
@@ -520,21 +520,11 @@ def _reduce_window_batch_rule(reduce_window, batched_args, bdims, *,
 
 def reduce_window_sharding_rule(operand, window_dimensions, window_strides,
                                 padding, base_dilation, window_dilation):
-  if base_dilation is None:
-    base_dilation = [1] * operand.ndim
-  if window_dilation is None:
-    window_dilation = [1] * operand.ndim
-
-  for spec, wdim, ws, pd, bd, wdil in zip(
-      operand.sharding.spec, window_dimensions, window_strides, padding,
-      base_dilation, window_dilation):
-    if spec is None:
-      continue
-    if not (wdim == 1 and ws == 1 and pd == (0, 0) and bd == 1 and wdil == 1):
-      raise core.ShardingTypeError(
-          "Only trivial windowing is supported along non-replicated"
-          f" dimensions. Got {operand.sharding.spec=}")
-  return operand.sharding
+  out_shape = reduce_window_shape_tuple(
+      operand.shape, window_dimensions, window_strides, padding, base_dilation,
+      window_dilation)
+  return lax.slicing._get_sharding_for_varying_out_shape(
+      out_shape, operand, 'reduce_window')
 
 reduce_window_sum_p = lax.standard_primitive(
     _reduce_window_sum_shape_rule, lax._input_dtype, 'reduce_window_sum',
@@ -680,8 +670,14 @@ def _select_and_scatter_shape_rule(
     raise TypeError(msg.format(window_strides, window_dimensions))
   return operand.shape
 
+def _select_and_scatter_sharding_rule(
+    operand, source, init_value, *, select_jaxpr, select_consts, scatter_jaxpr,
+    scatter_consts, window_dimensions, window_strides, padding):
+  return operand.sharding
+
 select_and_scatter_p = lax.standard_primitive(
     _select_and_scatter_shape_rule, lax._input_dtype, 'select_and_scatter',
+    sharding_rule=_select_and_scatter_sharding_rule,
     vma_rule=partial(core.standard_vma_rule, 'select_and_scatter'))
 
 def _select_and_scatter_lower(
@@ -722,7 +718,8 @@ def _select_and_scatter_lower(
         *scatter.arguments,
         dim_var_values=ctx.dim_var_values)
     hlo.return_(mlir.flatten_ir_values(out_nodes))
-  return op.results
+  return [mlir.lower_with_sharding_in_types(ctx, r, aval)
+          for r, aval in zip(op.results, ctx.avals_out)]
 
 mlir.register_lowering(select_and_scatter_p, _select_and_scatter_lower)
 
@@ -731,6 +728,11 @@ def _select_and_scatter_add_shape_rule(
     padding):
   return operand.shape
 
+def _select_and_scatter_add_sharding_rule(
+    source, operand, *, select_prim, window_dimensions, window_strides,
+    padding):
+  return operand.sharding
+
 def _select_and_scatter_add_jvp(
     primals, tangents, *, select_prim, window_dimensions, window_strides,
     padding):
@@ -779,6 +781,7 @@ def _select_and_scatter_add_batch_rule(
 select_and_scatter_add_p = lax.standard_primitive(
     _select_and_scatter_add_shape_rule, lax._input_dtype,
     'select_and_scatter_add',
+    sharding_rule=_select_and_scatter_add_sharding_rule,
     vma_rule=partial(core.standard_vma_rule, 'select_and_scatter_add'))
 
 ad.primitive_transposes[select_and_scatter_add_p] = \

tests/pjit_test.py

+52
@@ -7517,6 +7517,58 @@ def f2(x, i, j):
       return x.at[i].set(x_j)
     f2(x,i,j)  # doesn't crash
 
+  @jtu.with_explicit_mesh((4, 2), ('x', 'y'))
+  def test_conv_general_dilated(self, mesh):
+    arr = jax.device_put(np.zeros((16, 128, 8)), P('x', 'y'))
+
+    @jax.jit
+    def f(x):
+      # Conv1D across sharded y-axis:
+      out = jax.lax.conv_general_dilated(
+          x, np.zeros((5, 8, 10)),
+          window_strides=(1,), padding='SAME', feature_group_count=1,
+          lhs_dilation=(1,), rhs_dilation=(1,),
+          dimension_numbers=('NWC', 'WIO', 'NWC'))
+      self.assertEqual(out.aval.sharding.spec, P('x', 'y', None))
+      # Max pooling along sharded y-axis.
+      out2 = jax.lax.reduce_window(
+          out, -np.inf, jax.lax.max, (1,2,1), (1,2,1), 'SAME')
+      self.assertEqual(out2.aval.sharding.spec, P('x', 'y', None))
+      return out2
+
+    out = f(arr)
+    self.assertEqual(out.sharding, NamedSharding(mesh, P('x', 'y', None)))
+    self.check_wsc_in_lowered(f.lower(arr).as_text())
+
+    jax.jit(jax.grad(lambda x: f(x).sum()))(arr)  # doesn't crash
+
+    with self.assertRaises(core.ShardingTypeError):
+      arr2 = jax.device_put(np.zeros((16, 128, 8)), P('x', None, 'y'))
+      f(arr2)
+
+  @parameterized.named_parameters(
+      ('spec1', P('x', 'y', None)),
+      ('spec2', P('x', None, 'y')),
+      ('spec3', P(None, 'x', 'y')),
+      ('spec4', P(('x', 'y'), None, None))
+  )
+  @jtu.with_explicit_mesh((4, 2), ('x', 'y'))
+  def test_reduce_window(self, spec, mesh):
+    arr = jax.device_put(np.zeros((16, 128, 8)), spec)
+
+    @jax.jit
+    def f(x):
+      out = jax.lax.reduce_window(
+          x, -np.inf, jax.lax.max, (1,2,1), (1,2,1), 'SAME')
+      self.assertEqual(out.aval.sharding.spec, spec)
+      return out
+
+    out = f(arr)
+    self.assertEqual(out.sharding, NamedSharding(mesh, spec))
+    self.check_wsc_in_lowered(f.lower(arr).as_text())
+
+    jax.jit(jax.grad(lambda x: f(x).sum()))(arr)  # doesn't crash
+
 
 @jtu.pytest_mark_if_available('multiaccelerator')
 class PJitErrorTest(jtu.JaxTestCase):
