Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions challenges/easy/19_reverse_array/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn reverse_array_kernel(input: UnsafePointer[Float32], N: Int32):
fn reverse_array_kernel(input: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# input is a device pointer (i.e. pointer to memory on the GPU)
@export
def solve(input: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

ctx.enqueue_function[reverse_array_kernel](
var _kernel = ctx.compile_function[reverse_array_kernel, reverse_array_kernel]()
ctx.enqueue_function(_kernel,
input, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/1_vector_add/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn vector_add_kernel(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], N: Int32):
fn vector_add_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# A, B, C are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], N: Int32):
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N, BLOCK_SIZE)

ctx.enqueue_function[vector_add_kernel](
var _kernel = ctx.compile_function[vector_add_kernel, vector_add_kernel]()
ctx.enqueue_function(_kernel,
A, B, C, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/21_relu/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn relu_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn relu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

ctx.enqueue_function[relu_kernel](
var _kernel = ctx.compile_function[relu_kernel, relu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/23_leaky_relu/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn leaky_relu_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn leaky_relu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

ctx.enqueue_function[leaky_relu_kernel](
var _kernel = ctx.compile_function[leaky_relu_kernel, leaky_relu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/24_rainbow_table/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn fnv1a_hash(input: Int32) -> UInt32:
alias FNV_PRIME: UInt32 = 16777619
Expand All @@ -15,19 +15,20 @@ fn fnv1a_hash(input: Int32) -> UInt32:

return hash

fn fnv1a_hash_kernel(input: UnsafePointer[Int32], output: UnsafePointer[UInt32],
fn fnv1a_hash_kernel(input: UnsafePointer[Int32, MutExternalOrigin], output: UnsafePointer[UInt32, MutExternalOrigin],
N: Int32, R: Int32):
pass

# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Int32], output: UnsafePointer[UInt32], N: Int32, R: Int32):
fn solve(input: UnsafePointer[Int32, MutExternalOrigin], output: UnsafePointer[UInt32, MutExternalOrigin], N: Int32, R: Int32) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

ctx.enqueue_function[fnv1a_hash_kernel](
var _kernel = ctx.compile_function[fnv1a_hash_kernel, fnv1a_hash_kernel]()
ctx.enqueue_function(_kernel,
input, output, N, R,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/2_matrix_multiplication/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math.math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn matrix_multiplication_kernel(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], M: Int32, N: Int32, K: Int32):
fn matrix_multiplication_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], M: Int32, N: Int32, K: Int32):
pass

# A, B, C are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], M: Int32, N: Int32, K: Int32):
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], C: UnsafePointer[Float32, MutExternalOrigin], M: Int32, N: Int32, K: Int32) raises:
var BLOCK_SIZE: Int32 = 16
var ctx = DeviceContext()

var grid_dim_x = ceildiv(K, BLOCK_SIZE)
var grid_dim_y = ceildiv(M, BLOCK_SIZE)

ctx.enqueue_function[matrix_multiplication_kernel](
var _kernel = ctx.compile_function[matrix_multiplication_kernel, matrix_multiplication_kernel]()
ctx.enqueue_function(_kernel,
A, B, C, M, N, K,
grid_dim = (grid_dim_x, grid_dim_y),
block_dim = (BLOCK_SIZE, BLOCK_SIZE)
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/31_matrix_copy/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn copy_matrix_kernel(A: UnsafePointer[Float32], B: UnsafePointer[Float32], N: Int32):
fn copy_matrix_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# A, B are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], N: Int32):
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var total = N * N
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(total, threadsPerBlock)

ctx.enqueue_function[copy_matrix_kernel](
var _kernel = ctx.compile_function[copy_matrix_kernel, copy_matrix_kernel]()
ctx.enqueue_function(_kernel,
A, B, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/3_matrix_transpose/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn matrix_transpose_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], rows: Int32, cols: Int32):
fn matrix_transpose_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], rows: Int32, cols: Int32):
pass

# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], rows: Int32, cols: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], rows: Int32, cols: Int32) raises:
var BLOCK_SIZE: Int32 = 32
var ctx = DeviceContext()

var grid_dim_x = ceildiv(cols, BLOCK_SIZE)
var grid_dim_y = ceildiv(rows, BLOCK_SIZE)

ctx.enqueue_function[matrix_transpose_kernel](
var _kernel = ctx.compile_function[matrix_transpose_kernel, matrix_transpose_kernel]()
ctx.enqueue_function(_kernel,
input, output, rows, cols,
grid_dim = (grid_dim_x, grid_dim_y),
block_dim = (BLOCK_SIZE, BLOCK_SIZE)
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/52_silu/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn silu_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn silu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var threadsPerBlock: Int32 = 256
var ctx = DeviceContext()

var blocksPerGrid = ceildiv(N, threadsPerBlock)

ctx.enqueue_function[silu_kernel](
var _kernel = ctx.compile_function[silu_kernel, silu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = blocksPerGrid,
block_dim = threadsPerBlock
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/54_swiglu/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv


fn swiglu_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn swiglu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass


# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N // 2, BLOCK_SIZE)

ctx.enqueue_function[swiglu_kernel](
var _kernel = ctx.compile_function[swiglu_kernel, swiglu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/62_value_clipping/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn clip_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], lo: Float32, hi: Float32, N: Int32):
fn clip_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], lo: Float32, hi: Float32, N: Int32):
pass


# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], lo: Float32, hi: Float32, N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], lo: Float32, hi: Float32, N: Int32) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N, BLOCK_SIZE)

ctx.enqueue_function[clip_kernel](
var _kernel = ctx.compile_function[clip_kernel, clip_kernel]()
ctx.enqueue_function(_kernel,
input, output, lo, hi, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/63_interleave/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv

fn interleave_kernel(A: UnsafePointer[Float32], B: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn interleave_kernel(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass

# A, B, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(A: UnsafePointer[Float32, MutExternalOrigin], B: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N, BLOCK_SIZE)

ctx.enqueue_function[interleave_kernel](
var _kernel = ctx.compile_function[interleave_kernel, interleave_kernel]()
ctx.enqueue_function(_kernel,
A, B, output, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
Expand Down
15 changes: 8 additions & 7 deletions challenges/easy/65_geglu/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv
from std.gpu.host import DeviceContext
from std.gpu import block_dim, block_idx, thread_idx
from std.memory import UnsafePointer
from std.math import ceildiv


fn geglu_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn geglu_kernel(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32):
pass


# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
fn solve(input: UnsafePointer[Float32, MutExternalOrigin], output: UnsafePointer[Float32, MutExternalOrigin], N: Int32) raises:
var BLOCK_SIZE: Int32 = 256
var ctx = DeviceContext()
var num_blocks = ceildiv(N // 2, BLOCK_SIZE)

ctx.enqueue_function[geglu_kernel](
var _kernel = ctx.compile_function[geglu_kernel, geglu_kernel]()
ctx.enqueue_function(_kernel,
input, output, N,
grid_dim = num_blocks,
block_dim = BLOCK_SIZE
Expand Down
Loading
Loading