Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<h2>Multi-GPU Vector Addition</h2>

<p>
Compute the element-wise sum of two vectors <code>A</code> and <code>B</code>
into output vector <code>C</code> on <strong>2 GPUs</strong>.
</p>

<p>
Your <code>solve()</code> function is invoked once on each rank. The same
input tensors <code>A</code> and <code>B</code> are available on every rank
(fully replicated), and each rank must produce the full result in its own
<code>C</code>. You may parallelize the work across ranks and use collectives
(PyTorch: <code>torch.distributed</code>; CUDA: NCCL with the rank/world-size
and NCCL unique-id file exposed via environment variables) to combine partial
results.
</p>

<p>Environment variables available inside <code>solve()</code>:</p>
<ul>
<li><code>RANK</code>: this rank's index (0-based)</li>
<li><code>WORLD_SIZE</code>: total number of ranks (2 for this challenge)</li>
<li><code>LOCAL_RANK</code>: local device index for this rank</li>
<li><code>LEETGPU_NCCL_ID_FILE</code>: (CUDA only) path to a shared file for
bootstrapping NCCL via <code>ncclGetUniqueId</code> /
<code>ncclCommInitRank</code></li>
</ul>

<p>
For PyTorch solutions the process group (<code>nccl</code> backend) is
pre-initialized by the runner — just call
<code>torch.distributed.all_reduce</code>,
<code>torch.distributed.all_gather</code>, etc.
</p>

<h3>Constraints</h3>
<ul>
<li><code>1 &le; N &le; 10<sup>7</sup></code></li>
<li><code>A</code>, <code>B</code>, <code>C</code> are contiguous
<code>float32</code> tensors on the GPU</li>
</ul>

<p><strong>This is a Pro-only multi-GPU challenge.</strong></p>
68 changes: 68 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Easy multi-GPU challenge: element-wise vector addition C = A + B on 2 GPUs."""

    def __init__(self):
        super().__init__(
            name="Multi-GPU Vector Addition",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=2,
            access_tier="pro",
        )

    def reference_impl(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
        """Ground-truth implementation: write the element-wise sum of A and B into C."""
        assert A.shape == B.shape == C.shape
        assert A.dtype == B.dtype == C.dtype
        assert A.device == B.device == C.device

        torch.add(A, B, out=C)

    def get_solve_signature(self) -> Dict[str, tuple]:
        """C ABI of solve(): argument ctypes paired with their in/out direction."""
        return {
            "A": (ctypes.POINTER(ctypes.c_float), "in"),
            "B": (ctypes.POINTER(ctypes.c_float), "in"),
            "C": (ctypes.POINTER(ctypes.c_float), "out"),
            "N": (ctypes.c_size_t, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Small deterministic example (N = 8) shown alongside the challenge text."""
        dtype = torch.float32
        N = 8
        # Same values as the literals [1..8] and [8..1], built from ranges.
        A = torch.arange(1.0, 9.0, device="cuda", dtype=dtype)
        B = torch.arange(8.0, 0.0, -1.0, device="cuda", dtype=dtype)
        C = torch.empty(N, device="cuda", dtype=dtype)
        return {"A": A, "B": B, "C": C, "N": N}

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Randomized correctness cases across several input sizes."""
        dtype = torch.float32

        def make_case(size: int) -> Dict[str, Any]:
            # One fully-specified test case for a given vector length.
            return {
                "A": torch.empty(size, device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "B": torch.empty(size, device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "C": torch.zeros(size, device="cuda", dtype=dtype),
                "N": size,
            }

        return [make_case(size) for size in (8, 64, 1024, 8192)]

    def generate_performance_test(self) -> Dict[str, Any]:
        """Single large case (N = 8e6) used for timing."""
        dtype = torch.float32
        N = 8_000_000
        return {
            "A": torch.empty(N, device="cuda", dtype=dtype).uniform_(-1000.0, 1000.0),
            "B": torch.empty(N, device="cuda", dtype=dtype).uniform_(-1000.0, 1000.0),
            "C": torch.zeros(N, device="cuda", dtype=dtype),
            "N": N,
        }
51 changes: 51 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/solution/solution.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include <cuda_runtime.h>
#include <nccl.h>
#include <stdio.h>
#include <stdlib.h>

// Lazily create (and cache) the NCCL communicator for this rank.
// The runner writes a pre-generated ncclUniqueId to LEETGPU_NCCL_ID_FILE
// before solve() is called, so every rank bootstraps by reading that file.
// Aborts the process with a diagnostic if the environment or id file is
// missing/unreadable — the original passed NULL to atoi/fopen and used an
// uninitialized id on a short read.
static ncclComm_t get_comm() {
    static ncclComm_t comm = nullptr;
    if (comm)
        return comm;  // already initialized on an earlier call

    const char* rank_env = getenv("RANK");
    const char* world_env = getenv("WORLD_SIZE");
    const char* id_path = getenv("LEETGPU_NCCL_ID_FILE");
    if (!rank_env || !world_env || !id_path) {
        fprintf(stderr, "get_comm: RANK/WORLD_SIZE/LEETGPU_NCCL_ID_FILE not set\n");
        exit(1);
    }
    int rank = atoi(rank_env);
    int world_size = atoi(world_env);

    // Every rank reads the same unique id, joining one communicator.
    ncclUniqueId id;
    FILE* f = fopen(id_path, "rb");
    if (!f || fread(&id, sizeof(id), 1, f) != 1) {
        fprintf(stderr, "get_comm: failed to read NCCL unique id from %s\n", id_path);
        if (f)
            fclose(f);
        exit(1);
    }
    fclose(f);

    ncclResult_t rc = ncclCommInitRank(&comm, world_size, id, rank);
    if (rc != ncclSuccess) {
        fprintf(stderr, "get_comm: ncclCommInitRank failed: %s\n", ncclGetErrorString(rc));
        exit(1);
    }
    return comm;
}

// One thread per element of the slice [lo, hi): C[i] = A[i] + B[i].
// Threads whose global index falls past `hi` simply exit.
__global__ void vector_add_slice(const float* A, const float* B, float* C, int lo, int hi) {
    int i = lo + blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= hi)
        return;
    C[i] = A[i] + B[i];
}

extern "C" void solve(const float* A, const float* B, float* C, int N) {
int rank = atoi(getenv("RANK"));
int world_size = atoi(getenv("WORLD_SIZE"));
ncclComm_t comm = get_comm();

int chunk = (N + world_size - 1) / world_size;
int lo = rank * chunk;
int hi = lo + chunk > N ? N : lo + chunk;

cudaMemset(C, 0, N * sizeof(float));

int local = hi - lo;
if (local > 0) {
int block = 256;
int grid = (local + block - 1) / block;
vector_add_slice<<<grid, block>>>(A, B, C, lo, hi);
}

ncclAllReduce((const void*)C, (void*)C, N, ncclFloat, ncclSum, comm, 0);
cudaDeviceSynchronize();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import torch
import torch.distributed as dist


# For multi-gpu challenges, the runner passes torch tensors (so dist collectives work).
# Inner CuTe kernels can be written with @cute.jit and called with from_dlpack conversion.
def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Each rank adds its contiguous slice of A and B into C, then a sum
    all-reduce combines the per-rank partials so every rank holds the full
    result. Inputs are replicated across ranks."""
    rank, world = dist.get_rank(), dist.get_world_size()

    per_rank = -(-N // world)  # ceil(N / world)
    start = rank * per_rank
    stop = min(start + per_rank, N)

    # Zero first: regions outside this rank's slice must contribute 0 to the sum.
    C.zero_()
    if start < stop:
        # Simple per-rank slice compute; a real cute kernel would live here.
        torch.add(A[start:stop], B[start:stop], out=C[start:stop])

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import jax
import jax.numpy as jnp


# JAX multi-GPU: the runner calls jax.distributed.initialize before loading this
# module, so jax.process_count() / jax.process_index() reflect the world.
# Inputs are fully replicated across ranks (deterministic seeding). This trivial
# solution computes the full result locally on each rank — cross-rank collectives
# via jax.experimental.multihost_utils are also supported.
@jax.jit
def solve(A: jax.Array, B: jax.Array, N: int) -> jax.Array:
    """Inputs are fully replicated, so compute the whole sum locally on each rank."""
    del N  # length is implied by the array shapes
    return jnp.add(A, B)
19 changes: 19 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/solution/solution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

import torch
import torch.distributed as dist


def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Compute C = A + B cooperatively: each rank fills its contiguous slice,
    then a sum all-reduce leaves every rank with the complete vector."""
    rank = int(os.environ["RANK"])
    world = int(os.environ["WORLD_SIZE"])

    span = -(-N // world)  # ceil(N / world)
    start = rank * span
    stop = min(start + span, N)

    # Zero everything outside this rank's slice so the sum is exact.
    C.zero_()
    if start < stop:
        C[start:stop] = A[start:stop].add(B[start:stop])

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import torch
import torch.distributed as dist
import triton
import triton.language as tl


@triton.jit
def vector_add_kernel(A, B, C, lo, hi, BLOCK_SIZE: tl.constexpr):
    """Element-wise A + B over global indices [lo, hi), written into C."""
    block_start = lo + tl.program_id(0) * BLOCK_SIZE
    idx = block_start + tl.arange(0, BLOCK_SIZE)
    in_range = idx < hi  # mask off the tail past hi
    lhs = tl.load(A + idx, mask=in_range)
    rhs = tl.load(B + idx, mask=in_range)
    tl.store(C + idx, lhs + rhs, mask=in_range)


def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
    """Each rank computes its contiguous slice with a Triton kernel, then a
    sum all-reduce gives every rank the full result."""
    rank = dist.get_rank()
    world = dist.get_world_size()

    span = (N + world - 1) // world  # ceil division
    lo = rank * span
    hi = min(N, lo + span)

    # Regions outside this rank's slice must contribute 0 to the reduce.
    C.zero_()
    if hi > lo:
        BLOCK = 1024
        n_blocks = triton.cdiv(hi - lo, BLOCK)
        vector_add_kernel[(n_blocks,)](A, B, C, lo, hi, BLOCK)

    dist.all_reduce(C, op=dist.ReduceOp.SUM)
55 changes: 55 additions & 0 deletions challenges/easy/100_multi_gpu_vector_add/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <cuda_runtime.h>
#include <nccl.h>
#include <stdio.h>
#include <stdlib.h>

// A pre-generated NCCL unique id is written by the runner at LEETGPU_NCCL_ID_FILE
// before solve() is called, so every rank can simply read it.
// Lazily initialize and cache one NCCL communicator for this process.
// RANK / WORLD_SIZE come from the runner's environment; the unique id is
// read from the shared file at LEETGPU_NCCL_ID_FILE.
// NOTE(review): getenv/fopen/fread results are unchecked here — a missing
// env var passes NULL to atoi/fopen and a short read leaves `id`
// uninitialized; confirm the runner always provides these before solve().
static ncclComm_t get_comm() {
    static ncclComm_t comm = nullptr;
    if (comm)
        return comm;  // already initialized on an earlier call

    int rank = atoi(getenv("RANK"));
    int world_size = atoi(getenv("WORLD_SIZE"));
    const char* id_path = getenv("LEETGPU_NCCL_ID_FILE");

    // Every rank reads the same pre-generated id, so all ranks join one
    // communicator of size world_size.
    ncclUniqueId id;
    FILE* f = fopen(id_path, "rb");
    fread(&id, sizeof(id), 1, f);
    fclose(f);

    ncclCommInitRank(&comm, world_size, id, rank);
    return comm;
}

// One thread per element of the slice [lo, hi): C[idx] = A[idx] + B[idx].
// Elements outside the slice are left untouched by this kernel.
__global__ void vector_add_slice(const float* A, const float* B, float* C, int lo, int hi) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = lo + tid;  // global element index for this thread
    if (idx < hi)
        C[idx] = A[idx] + B[idx];
}

// A, B, C are device pointers (inputs replicated across all ranks).
// Each rank must leave the full result in C.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int rank = atoi(getenv("RANK"));
    int world_size = atoi(getenv("WORLD_SIZE"));
    ncclComm_t comm = get_comm();

    // Ceil-divide N into contiguous per-rank chunks; the last rank's chunk
    // may be shorter (clamped by hi).
    int chunk = (N + world_size - 1) / world_size;
    int lo = rank * chunk;
    int hi = lo + chunk > N ? N : lo + chunk;

    // Zero the whole buffer: the sum-all-reduce below adds every rank's C,
    // so elements outside this rank's slice must contribute 0.
    cudaMemset(C, 0, N * sizeof(float));

    int local = hi - lo;
    if (local > 0) {
        int block = 256;
        int grid = (local + block - 1) / block;
        vector_add_slice<<<grid, block>>>(A, B, C, lo, hi);
    }

    // In-place all-reduce on the default stream (0); sendbuf == recvbuf is
    // supported by NCCL. The device sync makes the result visible on return.
    ncclAllReduce((const void*)C, (void*)C, N, ncclFloat, ncclSum, comm, 0);
    cudaDeviceSynchronize();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os

import torch
import torch.distributed as dist


# A, B, C are tensors on the GPU. The same inputs are replicated across all ranks.
# Each rank must leave the full result in C.
#
# The process group is already initialized by the runner (NCCL backend). Use
# torch.distributed collectives to parallelize work across ranks.
def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, N: int):
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

# Split [0, N) into world_size contiguous chunks and compute the local slice.
chunk = (N + world_size - 1) // world_size
lo = rank * chunk
hi = min(N, lo + chunk)

C.zero_()
if lo < hi:
C[lo:hi] = A[lo:hi] + B[lo:hi]

# Combine partial results so every rank holds the full C.
dist.all_reduce(C, op=dist.ReduceOp.SUM)
Loading
Loading