Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/create-challenge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:

- name: Install dependencies
run: |
pip install pre-commit requests websocket-client
pip install pre-commit requests websocket-client modular
pre-commit install

- name: Fetch open PRs
Expand Down
15 changes: 15 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,21 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Install Modular (mojo)
run: |
python -m pip install --upgrade pip
pip install modular

- name: Check Mojo formatting with mojo format
run: |
find challenges -name "*.mojo" -type f -print0 | xargs -0 mojo format -q
git diff --exit-code -- '*.mojo'

- name: Check Mojo files exist and are valid
run: |
echo "Checking Mojo files for basic syntax issues..."
Expand Down
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ repos:
types_or: [c++, c, cuda]
files: \.(cu|cpp|h)$

# Mojo formatting
- repo: local
hooks:
- id: mojo-format
name: mojo format
entry: mojo format -q
language: system
files: \.mojo$

# General file checks
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
Expand Down
8 changes: 3 additions & 5 deletions challenges/easy/19_reverse_array/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv


fn reverse_array_kernel(input: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass


# input is a device pointer (i.e. pointer to memory on the GPU)
@export
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
Expand All @@ -15,10 +17,6 @@ fn solve(input: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var blocksPerGrid = ceildiv(N, threadsPerBlock)

var _kernel = ctx.compile_function[reverse_array_kernel, reverse_array_kernel]()
ctx.enqueue_function(_kernel,
input, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
)
ctx.enqueue_function(_kernel, input, N, grid_dim=blocksPerGrid, block_dim=threadsPerBlock)

ctx.synchronize()
22 changes: 15 additions & 7 deletions challenges/easy/1_vector_add/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,29 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn vector_add_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], N: Int32):

fn vector_add_kernel(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
C: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
):
pass


# A, B, C are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
fn solve(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
C: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N, BLOCK_SIZE)

var _kernel = ctx.compile_function[vector_add_kernel, vector_add_kernel]()
ctx.enqueue_function(_kernel,
A, B, C, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
)
ctx.enqueue_function(_kernel, A, B, C, N, grid_dim=num_blocks, block_dim=BLOCK_SIZE)

ctx.synchronize()
20 changes: 14 additions & 6 deletions challenges/easy/21_relu/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,30 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn relu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):

fn relu_kernel(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
):
pass


# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
fn solve(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

var _kernel = ctx.compile_function[relu_kernel, relu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
ctx.enqueue_function(
_kernel, input, output, N, grid_dim=blocksPerGrid, block_dim=threadsPerBlock
)

ctx.synchronize()
20 changes: 14 additions & 6 deletions challenges/easy/23_leaky_relu/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,30 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn leaky_relu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):

fn leaky_relu_kernel(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
):
pass


# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
fn solve(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

var _kernel = ctx.compile_function[leaky_relu_kernel, leaky_relu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
ctx.enqueue_function(
_kernel, input, output, N, grid_dim=blocksPerGrid, block_dim=threadsPerBlock
)

ctx.synchronize()
24 changes: 17 additions & 7 deletions challenges/easy/24_rainbow_table/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv


fn fnv1a_hash(input: Int32) -> UInt32:
alias FNV_PRIME: UInt32 = 16777619
alias OFFSET_BASIS: UInt32 = 2166136261
Expand All @@ -15,23 +16,32 @@ fn fnv1a_hash(input: Int32) -> UInt32:

return hash

fn fnv1a_hash_kernel(input: UnsafePointer[Int32, MutExternalOrigin], output: UnsafePointer[UInt32, MutExternalOrigin],
N: Int32, R: Int32):

fn fnv1a_hash_kernel(
input: UnsafePointer[Int32, MutExternalOrigin],
output: UnsafePointer[UInt32, MutExternalOrigin],
N: Int32,
R: Int32,
):
pass


# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(input: UnsafePointer[Int32, MutExternalOrigin], output: UnsafePointer[UInt32, MutExternalOrigin], N: Int32, R: Int32) raises:
fn solve(
input: UnsafePointer[Int32, MutExternalOrigin],
output: UnsafePointer[UInt32, MutExternalOrigin],
N: Int32,
R: Int32,
) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

var _kernel = ctx.compile_function[fnv1a_hash_kernel, fnv1a_hash_kernel]()
ctx.enqueue_function(_kernel,
input, output, N, R,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
ctx.enqueue_function(
_kernel, input, output, N, R, grid_dim=blocksPerGrid, block_dim=threadsPerBlock
)

ctx.synchronize()
34 changes: 28 additions & 6 deletions challenges/easy/2_matrix_multiplication/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,45 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn matrix_multiplication_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], M: Int32, N: Int32, K: Int32):

fn matrix_multiplication_kernel(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
C: UnsafePointer[Float32, MutExternalOrigin],
M: Int32,
N: Int32,
K: Int32,
):
pass


# A, B, C are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], M: Int32, N: Int32, K: Int32) raises:
fn solve(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
C: UnsafePointer[Float32, MutExternalOrigin],
M: Int32,
N: Int32,
K: Int32,
) raises:
var BLOCK_SIZE: Int32 = 16
var ctx = DeviceContext()

var grid_dim_x = ceildiv(K, BLOCK_SIZE)
var grid_dim_y = ceildiv(M, BLOCK_SIZE)

var _kernel = ctx.compile_function[matrix_multiplication_kernel, matrix_multiplication_kernel]()
ctx.enqueue_function(_kernel,
A, B, C, M, N, K,
grid_dim = (grid_dim_x, grid_dim_y),
block_dim = (BLOCK_SIZE, BLOCK_SIZE)
ctx.enqueue_function(
_kernel,
A,
B,
C,
M,
N,
K,
grid_dim=(grid_dim_x, grid_dim_y),
block_dim=(BLOCK_SIZE, BLOCK_SIZE),
)

ctx.synchronize()
20 changes: 13 additions & 7 deletions challenges/easy/31_matrix_copy/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,29 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn copy_matrix_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], N: Int32):

fn copy_matrix_kernel(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
):
pass


# A, B are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
fn solve(
A: UnsafePointer[Float32, MutExternalOrigin],
B: UnsafePointer[Float32, MutExternalOrigin],
N: Int32,
) raises:
var total = N * N
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(total, threadsPerBlock)

var _kernel = ctx.compile_function[copy_matrix_kernel, copy_matrix_kernel]()
ctx.enqueue_function(_kernel,
A, B, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
)
ctx.enqueue_function(_kernel, A, B, N, grid_dim=blocksPerGrid, block_dim=threadsPerBlock)

ctx.synchronize()
28 changes: 22 additions & 6 deletions challenges/easy/3_matrix_transpose/starter/starter.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,39 @@ from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn matrix_transpose_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], rows: Int32, cols: Int32):

fn matrix_transpose_kernel(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
rows: Int32,
cols: Int32,
):
pass


# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], rows: Int32, cols: Int32) raises:
fn solve(
input: UnsafePointer[Float32, MutExternalOrigin],
output: UnsafePointer[Float32, MutExternalOrigin],
rows: Int32,
cols: Int32,
) raises:
var BLOCK_SIZE: Int32 = 32
var ctx = DeviceContext()

var grid_dim_x = ceildiv(cols, BLOCK_SIZE)
var grid_dim_y = ceildiv(rows, BLOCK_SIZE)

var _kernel = ctx.compile_function[matrix_transpose_kernel, matrix_transpose_kernel]()
ctx.enqueue_function(_kernel,
input, output, rows, cols,
grid_dim = (grid_dim_x, grid_dim_y),
block_dim = (BLOCK_SIZE, BLOCK_SIZE)
ctx.enqueue_function(
_kernel,
input,
output,
rows,
cols,
grid_dim=(grid_dim_x, grid_dim_y),
block_dim=(BLOCK_SIZE, BLOCK_SIZE),
)

ctx.synchronize()
Loading
Loading