From 40b8bd93fb776c075ba10cc3cf7b3b2e7f992843 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 01:00:08 +0530
Subject: [PATCH 1/6] feat(cuda): add attention forward backward kernel declarations (#64)

* docs: report [run_20260530_165216] (~791 tok/s)

Includes metrics for generalization gap, throughput (~791 tok/s), and gradient norms.
Parameters: 6.68M | lr: 1e-3 | batch: 16 | steps: 6000
- Achieved best validation loss of 4.1319 at step 3900

* docs:report [run_20260530_165216](~791 tok/s) (#61)

Includes metrics for generalization gap, throughput (~791 tok/s), and gradient norms.
Parameters: 6.68M | lr: 1e-3 | batch: 16 | steps: 6000
- Achieved best validation loss of 4.1319 at step 3900

Co-authored-by: Max

* feat(cuda): add attention forward and backward kernel declarations

Introduces the header declarations for `attention_forward` and
`attention_backward` operations inside the `quadtrix::cuda` namespace.
Configured with support for custom CUDA streams and head partitioning.

---------

Co-authored-by: Max
---
Review notes (below the separator, so not part of the commit message):
  * This patch only declares attention_forward and attention_backward;
    no definitions are added here.
  * Both entry points return Status and take `cudaStream_t stream = nullptr`
    as their last, defaulted parameter.
  * TensorView and Status are not declared in this header; presumably they
    arrive through "tensor.cuh" (not part of this series) - confirm.
  * The two "docs: report" entries in the message touch no file in this
    patch; they come from the squashed history.

 CUDA/includes/attention.cuh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 CUDA/includes/attention.cuh

diff --git a/CUDA/includes/attention.cuh b/CUDA/includes/attention.cuh
new file mode 100644
index 0000000..7feac08
--- /dev/null
+++ b/CUDA/includes/attention.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status attention_forward(
+    const TensorView& input_qkv,
+    TensorView preatt,
+    TensorView att,
+    TensorView output,
+    int num_heads,
+    cudaStream_t stream = nullptr);
+
+Status attention_backward(
+    const TensorView& grad_output,
+    const TensorView& input_qkv,
+    const TensorView& att,
+    TensorView grad_input_qkv,
+    TensorView grad_preatt,
+    TensorView grad_att,
+    int num_heads,
+    cudaStream_t stream = nullptr);
+
+} // namespace cuda
+} // namespace quadtrix

From 4aac832e725f1ec5b2136b3167bfa7028e714ee5 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 22:30:58 +0530
Subject: [PATCH 2/6] feat(cuda): add checkpoint metadata struct and stub functions

---
Review notes (below the separator, so not part of the commit message):
  * CheckpointMetadata is a plain aggregate of five int fields, each
    default-initialised to 0.
  * load_checkpoint_metadata and save_tensor_checkpoint are stubs: both
    ignore their (unnamed) arguments and always return false.

 CUDA/includes/checkpoint.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 CUDA/includes/checkpoint.h

diff --git a/CUDA/includes/checkpoint.h b/CUDA/includes/checkpoint.h
new file mode 100644
index 0000000..ba91b0f
--- /dev/null
+++ b/CUDA/includes/checkpoint.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "tensor.cuh"
+
+namespace quadtrix {
+namespace cuda {
+
+struct CheckpointMetadata {
+    int vocab_size = 0;
+    int max_sequence_length = 0;
+    int num_layers = 0;
+    int num_heads = 0;
+    int channels = 0;
+};
+
+inline bool load_checkpoint_metadata(const char*, CheckpointMetadata*) {
+    return false;
+}
+
+inline bool save_tensor_checkpoint(const char*, const TensorView&) {
+    return false;
+}
+
+} // namespace cuda
+} // namespace quadtrix

From 47696058b34c95c45e715fb7b25dcec5a28ea955 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 22:34:04 +0530
Subject: [PATCH 3/6] feat(cuda): introduce core type definitions and error handling utilities

- Defines `DType` and `DeviceKind` enums supporting standard types
  (F32, F16, BF16, I32, U8).
- Implements `dtype_name` and `dtype_size` metadata helper functions.
- Adds an explicit `Status` struct for non-throwing error propagation
  alongside `checked_mul` for safe allocation size computation.
- Introduces `check_cuda` and `abort_on_cuda` error macros and handling
  mechanisms, exposed via the `QUADTRIX_CUDA_CHECK` macro.
---
Review notes (below the separator, so not part of the commit message):
  * dtype_name falls back to "unknown" for a value outside the enum, while
    dtype_size prints to stderr and calls std::abort() in the same case.
  * checked_mul returns false, leaving *out untouched, when lhs * rhs would
    exceed the std::size_t maximum; otherwise it stores the product and
    returns true. `out` is dereferenced without a null check.
  * Status stores the message pointer as given (no copy). check_cuda passes
    the stringified expression, which is a string literal when it is
    reached through QUADTRIX_CUDA_CHECK.
  * check_cuda logs the failure to stderr and returns Status::failure;
    abort_on_cuda logs and calls std::abort(). Both do nothing beyond
    returning when the error is cudaSuccess.
  * Both macros evaluate `expr` exactly once.

 CUDA/includes/common.h | 120 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 CUDA/includes/common.h

diff --git a/CUDA/includes/common.h b/CUDA/includes/common.h
new file mode 100644
index 0000000..36df155
--- /dev/null
+++ b/CUDA/includes/common.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class DType : std::uint8_t {
+    F32,
+    F16,
+    BF16,
+    I32,
+    U8,
+};
+
+enum class DeviceKind : std::uint8_t {
+    CPU,
+    CUDA,
+};
+
+struct Status {
+    bool ok;
+    cudaError_t cuda_error;
+    const char* message;
+
+    static Status success() {
+        return {true, cudaSuccess, "ok"};
+    }
+
+    static Status failure(cudaError_t error, const char* message) {
+        return {false, error, message};
+    }
+};
+
+inline const char* dtype_name(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return "f32";
+        case DType::F16:
+            return "f16";
+        case DType::BF16:
+            return "bf16";
+        case DType::I32:
+            return "i32";
+        case DType::U8:
+            return "u8";
+    }
+    return "unknown";
+}
+
+inline std::size_t dtype_size(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return 4;
+        case DType::F16:
+            return 2;
+        case DType::BF16:
+            return 2;
+        case DType::I32:
+            return 4;
+        case DType::U8:
+            return 1;
+    }
+
+    std::fprintf(stderr, "Unknown CUDA dtype value %u\n", static_cast<unsigned>(dtype));
+    std::abort();
+}
+
+inline bool checked_mul(std::size_t lhs, std::size_t rhs, std::size_t* out) {
+    if (lhs != 0 && rhs > std::numeric_limits<std::size_t>::max() / lhs) {
+        return false;
+    }
+    *out = lhs * rhs;
+    return true;
+}
+
+inline Status check_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    if (error == cudaSuccess) {
+        return Status::success();
+    }
+
+    std::fprintf(
+        stderr,
+        "CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    return Status::failure(error, expression);
+}
+
+inline void abort_on_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    if (error == cudaSuccess) {
+        return;
+    }
+
+    std::fprintf(
+        stderr,
+        "Fatal CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    std::abort();
+}
+
+} // namespace cuda
+} // namespace quadtrix
+
+#define QUADTRIX_CUDA_CHECK(expr) \
+    ::quadtrix::cuda::check_cuda((expr), #expr, __FILE__, __LINE__)
+
+#define QUADTRIX_CUDA_ABORT(expr) \
+    ::quadtrix::cuda::abort_on_cuda((expr), #expr, __FILE__, __LINE__)

From 7c94958781dddc8d38a30d34dd343a00417c7fc7 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 22:34:39 +0530
Subject: [PATCH 4/6] feat(cuda): add TokenBatchView struct and DataLoader stub class

---
Review notes (below the separator, so not part of the commit message):
  * TokenBatchView holds two const std::int32_t pointers plus two int
    sizes; all members default to nullptr / 0.
  * DataLoader::next is a stub: when `batch` is non-null it resets *batch
    to a default-constructed TokenBatchView, and it always returns false.

 CUDA/includes/dataloader.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 CUDA/includes/dataloader.h

diff --git a/CUDA/includes/dataloader.h b/CUDA/includes/dataloader.h
new file mode 100644
index 0000000..fd3c47d
--- /dev/null
+++ b/CUDA/includes/dataloader.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+struct TokenBatchView {
+    const std::int32_t* inputs = nullptr;
+    const std::int32_t* targets = nullptr;
+    int batch_size = 0;
+    int sequence_length = 0;
+};
+
+class DataLoader {
+public:
+    DataLoader() = default;
+
+    bool next(TokenBatchView* batch) {
+        if (batch != nullptr) {
+            *batch = {};
+        }
+        return false;
+    }
+};
+
+} // namespace cuda
+} // namespace quadtrix

From c62c869527bcf83ab494341b1667b7ac95e9af95 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 22:35:34 +0530
Subject: [PATCH 5/6] feat(cuda): add GeLU activation forward and backward declarations

- Introduces the `GeluMode` enum to toggle between `Exact` and `Approximate` mathematical variants.
- Declares the `gelu_forward` and `gelu_backward` kernel entrypoints.
- Configures both signatures with optional stream execution and a default mode of `GeluMode::Approximate`.
---
Review notes (below the separator, so not part of the commit message):
  * This patch only declares gelu_forward and gelu_backward; no
    definitions are added here.
  * Both default `mode` to GeluMode::Approximate and `stream` to nullptr.

 CUDA/includes/gelu.cuh | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 CUDA/includes/gelu.cuh

diff --git a/CUDA/includes/gelu.cuh b/CUDA/includes/gelu.cuh
new file mode 100644
index 0000000..af87e64
--- /dev/null
+++ b/CUDA/includes/gelu.cuh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class GeluMode : std::uint8_t {
+    Exact,
+    Approximate,
+};
+
+Status gelu_forward(
+    const TensorView& input,
+    TensorView output,
+    GeluMode mode = GeluMode::Approximate,
+    cudaStream_t stream = nullptr);
+
+Status gelu_backward(
+    const TensorView& grad_output,
+    const TensorView& input,
+    TensorView grad_input,
+    GeluMode mode = GeluMode::Approximate,
+    cudaStream_t stream = nullptr);
+
+} // namespace cuda
+} // namespace quadtrix

From 28117dc6f6e5bb2be6544f0a9007043a943686c1 Mon Sep 17 00:00:00 2001
From: Eamon
Date: Mon, 1 Jun 2026 22:47:36 +0530
Subject: [PATCH 6/6] feat(cuda): add gradient norm calculation and clipping interfaces

---
Review notes (below the separator, so not part of the commit message):
  * global_norm_squared and clip_gradients_by_global_norm are declarations
    only; clip_scale is the one function defined in this header.
  * clip_scale returns max_norm / global_norm only when global_norm is both
    greater than max_norm and greater than 0; in every other case it
    returns 1.0f. A NaN global_norm fails both comparisons and so also
    yields 1.0f.
  * A negative max_norm combined with a positive global_norm produces a
    negative scale; the function does not guard against that.

 CUDA/includes/global_norm.cuh | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 CUDA/includes/global_norm.cuh

diff --git a/CUDA/includes/global_norm.cuh b/CUDA/includes/global_norm.cuh
new file mode 100644
index 0000000..f418ab7
--- /dev/null
+++ b/CUDA/includes/global_norm.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status global_norm_squared(
+    const TensorView& grads,
+    TensorView partial_sums,
+    cudaStream_t stream = nullptr);
+
+Status clip_gradients_by_global_norm(
+    TensorView grads,
+    float global_norm,
+    float max_norm,
+    cudaStream_t stream = nullptr);
+
+inline float clip_scale(float global_norm, float max_norm) {
+    return global_norm > max_norm && global_norm > 0.0f ? max_norm / global_norm : 1.0f;
+}
+
+} // namespace cuda
+} // namespace quadtrix