Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<h2>Multi-GPU Vector Addition</h2>

<p>
Compute the element-wise sum of two vectors <code>A</code> and <code>B</code>
into output vector <code>C</code> on <strong>2 GPUs</strong>.
</p>

<p>
Your <code>solve()</code> function is invoked once on each rank. The same
input tensors <code>A</code> and <code>B</code> are available on every rank
(fully replicated), and each rank must produce the full result in its own
<code>C</code>. You may parallelize the work across ranks and use collectives
(PyTorch: <code>torch.distributed</code>; CUDA: NCCL with the rank/world-size
and NCCL unique-id file exposed via environment variables) to combine partial
results.
</p>

<p>Environment variables available inside <code>solve()</code>:</p>
<ul>
<li><code>RANK</code>: this rank's index (0-based)</li>
<li><code>WORLD_SIZE</code>: total number of ranks (2 for this challenge)</li>
<li><code>LOCAL_RANK</code>: local device index for this rank</li>
<li><code>LEETGPU_NCCL_ID_FILE</code>: (CUDA only) path to a shared file for
bootstrapping NCCL via <code>ncclGetUniqueId</code> /
<code>ncclCommInitRank</code></li>
</ul>

<p>
For PyTorch solutions the process group (<code>nccl</code> backend) is
pre-initialized by the runner — just call
<code>torch.distributed.all_reduce</code>,
<code>torch.distributed.all_gather</code>, etc.
</p>

<h3>Constraints</h3>
<ul>
<li><code>1 &le; N &le; 10<sup>7</sup></code></li>
<li><code>A</code>, <code>B</code>, <code>C</code> are contiguous
<code>float32</code> tensors on the GPU</li>
</ul>

<p><strong>This is a Pro-only multi-GPU challenge.</strong></p>
68 changes: 68 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Easy multi-GPU challenge: element-wise vector addition C = A + B on 2 GPUs."""

    def __init__(self):
        super().__init__(
            name="Multi-GPU Vector Addition",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=2,
            access_tier="pro",
        )

    def reference_impl(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
        """Ground-truth implementation: write the element-wise sum of A and B into C."""
        assert A.shape == B.shape == C.shape
        assert A.dtype == B.dtype == C.dtype
        assert A.device == B.device == C.device

        torch.add(A, B, out=C)

    def get_solve_signature(self) -> Dict[str, tuple]:
        """C ABI of solve(): argument ctypes paired with their in/out direction."""
        return {
            "A": (ctypes.POINTER(ctypes.c_float), "in"),
            "B": (ctypes.POINTER(ctypes.c_float), "in"),
            "C": (ctypes.POINTER(ctypes.c_float), "out"),
            "N": (ctypes.c_size_t, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Small deterministic example (N = 8) shown alongside the challenge text."""
        dtype = torch.float32
        N = 8
        # Same values as the literals [1..8] and [8..1], built from ranges.
        A = torch.arange(1.0, 9.0, device="cuda", dtype=dtype)
        B = torch.arange(8.0, 0.0, -1.0, device="cuda", dtype=dtype)
        C = torch.empty(N, device="cuda", dtype=dtype)
        return {"A": A, "B": B, "C": C, "N": N}

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Randomized correctness cases across several input sizes."""
        dtype = torch.float32

        def make_case(size: int) -> Dict[str, Any]:
            # One fully-specified test case for a given vector length.
            return {
                "A": torch.empty(size, device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "B": torch.empty(size, device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "C": torch.zeros(size, device="cuda", dtype=dtype),
                "N": size,
            }

        return [make_case(size) for size in (8, 64, 1024, 8192)]

    def generate_performance_test(self) -> Dict[str, Any]:
        """Single large case (N = 8e6) used for timing."""
        dtype = torch.float32
        N = 8_000_000
        return {
            "A": torch.empty(N, device="cuda", dtype=dtype).uniform_(-1000.0, 1000.0),
            "B": torch.empty(N, device="cuda", dtype=dtype).uniform_(-1000.0, 1000.0),
            "C": torch.zeros(N, device="cuda", dtype=dtype),
            "N": N,
        }
51 changes: 51 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/solution/solution.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include <cuda_runtime.h>
#include <nccl.h>
#include <stdio.h>
#include <stdlib.h>

// Lazily create (and cache) the NCCL communicator for this rank.
// The runner writes a pre-generated ncclUniqueId to LEETGPU_NCCL_ID_FILE
// before solve() is called, so every rank bootstraps by reading that file.
// Aborts the process with a diagnostic if the environment or id file is
// missing/unreadable — the original passed NULL to atoi/fopen and used an
// uninitialized id on a short read.
static ncclComm_t get_comm() {
    static ncclComm_t comm = nullptr;
    if (comm)
        return comm;  // already initialized on an earlier call

    const char* rank_env = getenv("RANK");
    const char* world_env = getenv("WORLD_SIZE");
    const char* id_path = getenv("LEETGPU_NCCL_ID_FILE");
    if (!rank_env || !world_env || !id_path) {
        fprintf(stderr, "get_comm: RANK/WORLD_SIZE/LEETGPU_NCCL_ID_FILE not set\n");
        exit(1);
    }
    int rank = atoi(rank_env);
    int world_size = atoi(world_env);

    // Every rank reads the same unique id, joining one communicator.
    ncclUniqueId id;
    FILE* f = fopen(id_path, "rb");
    if (!f || fread(&id, sizeof(id), 1, f) != 1) {
        fprintf(stderr, "get_comm: failed to read NCCL unique id from %s\n", id_path);
        if (f)
            fclose(f);
        exit(1);
    }
    fclose(f);

    ncclResult_t rc = ncclCommInitRank(&comm, world_size, id, rank);
    if (rc != ncclSuccess) {
        fprintf(stderr, "get_comm: ncclCommInitRank failed: %s\n", ncclGetErrorString(rc));
        exit(1);
    }
    return comm;
}

// One thread per element of the slice [lo, hi): C[i] = A[i] + B[i].
// Threads whose global index falls past `hi` simply exit.
__global__ void vector_add_slice(const float* A, const float* B, float* C, int lo, int hi) {
    int i = lo + blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= hi)
        return;
    C[i] = A[i] + B[i];
}

extern "C" void solve(const float* A, const float* B, float* C, int N) {
int rank = atoi(getenv("RANK"));
int world_size = atoi(getenv("WORLD_SIZE"));
ncclComm_t comm = get_comm();

int chunk = (N + world_size - 1) / world_size;
int lo = rank * chunk;
int hi = lo + chunk > N ? N : lo + chunk;

cudaMemset(C, 0, N * sizeof(float));

int local = hi - lo;
if (local > 0) {
int block = 256;
int grid = (local + block - 1) / block;
vector_add_slice<<<grid, block>>>(A, B, C, lo, hi);
}

ncclAllReduce((const void*)C, (void*)C, N, ncclFloat, ncclSum, comm, 0);
cudaDeviceSynchronize();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import torch
import torch.distributed as dist


# For multi-gpu challenges, the runner passes torch tensors (so dist collectives work).
# Inner CuTe kernels can be written with @cute.jit and called with from_dlpack conversion.
def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Each rank adds its contiguous slice of A and B into C, then a sum
    all-reduce combines the per-rank partials so every rank holds the full
    result. Inputs are replicated across ranks."""
    rank, world = dist.get_rank(), dist.get_world_size()

    per_rank = -(-N // world)  # ceil(N / world)
    start = rank * per_rank
    stop = min(start + per_rank, N)

    # Zero first: regions outside this rank's slice must contribute 0 to the sum.
    C.zero_()
    if start < stop:
        # Simple per-rank slice compute; a real cute kernel would live here.
        torch.add(A[start:stop], B[start:stop], out=C[start:stop])

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import jax
import jax.numpy as jnp


# JAX multi-GPU: the runner calls jax.distributed.initialize before loading this
# module, so jax.process_count() / jax.process_index() reflect the world.
# Inputs are fully replicated across ranks (deterministic seeding). This trivial
# solution computes the full result locally on each rank — cross-rank collectives
# via jax.experimental.multihost_utils are also supported.
@jax.jit
def solve(A: jax.Array, B: jax.Array, N: int) -> jax.Array:
    """Inputs are fully replicated, so compute the whole sum locally on each rank."""
    del N  # length is implied by the array shapes
    return jnp.add(A, B)
19 changes: 19 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/solution/solution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

import torch
import torch.distributed as dist


def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Compute C = A + B cooperatively: each rank fills its contiguous slice,
    then a sum all-reduce leaves every rank with the complete vector."""
    rank = int(os.environ["RANK"])
    world = int(os.environ["WORLD_SIZE"])

    span = -(-N // world)  # ceil(N / world)
    start = rank * span
    stop = min(start + span, N)

    # Zero everything outside this rank's slice so the sum is exact.
    C.zero_()
    if start < stop:
        C[start:stop] = A[start:stop].add(B[start:stop])

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import torch
import torch.distributed as dist
import triton
import triton.language as tl


@triton.jit
def vector_add_kernel(A, B, C, lo, hi, BLOCK_SIZE: tl.constexpr):
    """Element-wise A + B over global indices [lo, hi), written into C."""
    block_start = lo + tl.program_id(0) * BLOCK_SIZE
    idx = block_start + tl.arange(0, BLOCK_SIZE)
    in_range = idx < hi  # mask off the tail past hi
    lhs = tl.load(A + idx, mask=in_range)
    rhs = tl.load(B + idx, mask=in_range)
    tl.store(C + idx, lhs + rhs, mask=in_range)


def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Each rank computes its contiguous slice with a Triton kernel, then a
    sum all-reduce gives every rank the full result."""
    rank = dist.get_rank()
    world = dist.get_world_size()

    span = (N + world - 1) // world  # ceil division
    lo = rank * span
    hi = min(N, lo + span)

    # Regions outside this rank's slice must contribute 0 to the reduce.
    C.zero_()
    if hi > lo:
        BLOCK = 1024
        n_blocks = triton.cdiv(hi - lo, BLOCK)
        vector_add_kernel[(n_blocks,)](A, B, C, lo, hi, BLOCK)

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
55 changes: 55 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <cuda_runtime.h>
#include <nccl.h>
#include <stdio.h>
#include <stdlib.h>

// A pre-generated NCCL unique id is written by the runner at LEETGPU_NCCL_ID_FILE
// before solve() is called, so every rank can simply read it.
// Lazily initialize and cache one NCCL communicator for this process.
// RANK / WORLD_SIZE come from the runner's environment; the unique id is
// read from the shared file at LEETGPU_NCCL_ID_FILE.
// NOTE(review): getenv/fopen/fread results are unchecked here — a missing
// env var passes NULL to atoi/fopen and a short read leaves `id`
// uninitialized; confirm the runner always provides these before solve().
static ncclComm_t get_comm() {
    static ncclComm_t comm = nullptr;
    if (comm)
        return comm;  // already initialized on an earlier call

    int rank = atoi(getenv("RANK"));
    int world_size = atoi(getenv("WORLD_SIZE"));
    const char* id_path = getenv("LEETGPU_NCCL_ID_FILE");

    // Every rank reads the same pre-generated id, so all ranks join one
    // communicator of size world_size.
    ncclUniqueId id;
    FILE* f = fopen(id_path, "rb");
    fread(&id, sizeof(id), 1, f);
    fclose(f);

    ncclCommInitRank(&comm, world_size, id, rank);
    return comm;
}

// One thread per element of the slice [lo, hi): C[idx] = A[idx] + B[idx].
// Elements outside the slice are left untouched by this kernel.
__global__ void vector_add_slice(const float* A, const float* B, float* C, int lo, int hi) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = lo + tid;  // global element index for this thread
    if (idx < hi)
        C[idx] = A[idx] + B[idx];
}

// A, B, C are device pointers (inputs replicated across all ranks).
// Each rank must leave the full result in C.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int rank = atoi(getenv("RANK"));
    int world_size = atoi(getenv("WORLD_SIZE"));
    ncclComm_t comm = get_comm();

    // Ceil-divide N into contiguous per-rank chunks; the last rank's chunk
    // may be shorter (clamped by hi).
    int chunk = (N + world_size - 1) / world_size;
    int lo = rank * chunk;
    int hi = lo + chunk > N ? N : lo + chunk;

    // Zero the whole buffer: the sum-all-reduce below adds every rank's C,
    // so elements outside this rank's slice must contribute 0.
    cudaMemset(C, 0, N * sizeof(float));

    int local = hi - lo;
    if (local > 0) {
        int block = 256;
        int grid = (local + block - 1) / block;
        vector_add_slice<<<grid, block>>>(A, B, C, lo, hi);
    }

    // In-place all-reduce on the default stream (0); sendbuf == recvbuf is
    // supported by NCCL. The device sync makes the result visible on return.
    ncclAllReduce((const void*)C, (void*)C, N, ncclFloat, ncclSum, comm, 0);
    cudaDeviceSynchronize();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os

import torch
import torch.distributed as dist


# A, B, C are tensors on the GPU. The same inputs are replicated across all ranks.
# Each rank must leave the full result in C.
#
# The process group is already initialized by the runner (NCCL backend). Use
# torch.distributed collectives to parallelize work across ranks.
def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

# Split [0, N) into world_size contiguous chunks and compute the local slice.
chunk = (N + world_size - 1) // world_size
lo = rank * chunk
hi = min(N, lo + chunk)

C.zero_()
if lo < hi:
C[lo:hi] = A[lo:hi] + B[lo:hi]

# Combine partial results so every rank holds the full C.
dist.all_reduce(C, op=dist.ReduceOp.SUM)
Loading
Loading