From 40b8bd93fb776c075ba10cc3cf7b3b2e7f992843 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 01:00:08 +0530
Subject: [PATCH 1/6] feat(cuda): add attention forward backward kernel
 declarations (#64)

* docs: report [run_20260530_165216] (~791 tok/s)

Includes metrics for generalization gap, throughput (~791 tok/s), and gradient norms.
Parameters: 6.68M | lr: 1e-3 | batch: 16 | steps: 6000 - Achieved best validation loss of 4.1319 at step 3900

* docs: report [run_20260530_165216] (~791 tok/s) (#61)

Includes metrics for generalization gap, throughput (~791 tok/s), and gradient norms.
Parameters: 6.68M | lr: 1e-3 | batch: 16 | steps: 6000 - Achieved best validation loss of 4.1319 at step 3900

Co-authored-by: Max <eamon5174@gmail.com>

* feat(cuda): add attention forward and backward kernel declarations

Introduces the header declarations for `attention_forward` and
`attention_backward` operations inside the `quadtrix::cuda` namespace.
Configured with support for custom CUDA streams and head partitioning.

---------

Co-authored-by: Max <eamon5174@gmail.com>
---
 CUDA/includes/attention.cuh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 CUDA/includes/attention.cuh

diff --git a/CUDA/includes/attention.cuh b/CUDA/includes/attention.cuh
new file mode 100644
index 0000000..7feac08
--- /dev/null
+++ b/CUDA/includes/attention.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status attention_forward(  // Attention forward entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    const TensorView& input_qkv,  // read-only input view; presumably packed Q/K/V - TODO confirm layout
+    TensorView preatt,  // view passed by value; presumably receives pre-softmax scores - TODO confirm
+    TensorView att,  // view passed by value; presumably receives attention weights - TODO confirm
+    TensorView output,  // view passed by value; presumably receives the attention result - TODO confirm
+    int num_heads,  // head count used to partition the work (valid range not visible here)
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+Status attention_backward(  // Attention backward entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    const TensorView& grad_output,  // read-only view; presumably the gradient w.r.t. the forward output - TODO confirm
+    const TensorView& input_qkv,  // read-only view; same parameter name as the forward input
+    const TensorView& att,  // read-only view; presumably the attention weights saved by the forward pass - TODO confirm
+    TensorView grad_input_qkv,  // view passed by value; presumably receives the gradient for input_qkv - TODO confirm
+    TensorView grad_preatt,  // view passed by value; presumably gradient/scratch for pre-softmax scores - TODO confirm
+    TensorView grad_att,  // view passed by value; presumably gradient/scratch for attention weights - TODO confirm
+    int num_heads,  // head count; NOTE(review): likely must match the forward call - confirm in the implementation
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+}  // namespace cuda
+}  // namespace quadtrix

From 4aac832e725f1ec5b2136b3167bfa7028e714ee5 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 22:30:58 +0530
Subject: [PATCH 2/6] feat(cuda): add checkpoint metadata struct and stub
 functions

---
 CUDA/includes/checkpoint.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 CUDA/includes/checkpoint.h

diff --git a/CUDA/includes/checkpoint.h b/CUDA/includes/checkpoint.h
new file mode 100644
index 0000000..ba91b0f
--- /dev/null
+++ b/CUDA/includes/checkpoint.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "tensor.cuh"
+
+namespace quadtrix {
+namespace cuda {
+
+struct CheckpointMetadata {  // Plain aggregate of model-shape integers; every field defaults to 0.
+    int vocab_size = 0;  // vocabulary size; 0 by default
+    int max_sequence_length = 0;  // maximum sequence length; 0 by default
+    int num_layers = 0;  // layer count; 0 by default
+    int num_heads = 0;  // head count; 0 by default
+    int channels = 0;  // channel count (presumably the embedding width - TODO confirm); 0 by default
+};
+
+inline bool load_checkpoint_metadata(const char*, CheckpointMetadata*) {  // Stub: both parameters are unnamed and never used.
+    return false;  // always returns false; no file is opened and the metadata pointer is not written
+}
+
+inline bool save_tensor_checkpoint(const char*, const TensorView&) {  // Stub: both parameters are unnamed and never used.
+    return false;  // always returns false; nothing is written anywhere
+}
+
+}  // namespace cuda
+}  // namespace quadtrix

From 47696058b34c95c45e715fb7b25dcec5a28ea955 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 22:34:04 +0530
Subject: [PATCH 3/6] feat(cuda): introduce core type definitions and error
 handling utilities

- Defines `DType` and `DeviceKind` enums supporting standard types (F32, F16, BF16, I32, U8).
- Implements `dtype_name` and `dtype_size` metadata helper functions.
- Adds an explicit `Status` struct for non-throwing error propagation alongside `checked_mul` for safe allocation size computation.
- Introduces `check_cuda` and `abort_on_cuda` error macros and handling mechanisms, exposed via the `QUADTRIX_CUDA_CHECK` macro.
---
 CUDA/includes/common.h | 120 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 CUDA/includes/common.h

diff --git a/CUDA/includes/common.h b/CUDA/includes/common.h
new file mode 100644
index 0000000..36df155
--- /dev/null
+++ b/CUDA/includes/common.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class DType : std::uint8_t {  // Element-type tag with a one-byte underlying type; see dtype_name / dtype_size.
+    F32,  // dtype_name "f32", dtype_size 4
+    F16,  // dtype_name "f16", dtype_size 2
+    BF16,  // dtype_name "bf16", dtype_size 2
+    I32,  // dtype_name "i32", dtype_size 4
+    U8,  // dtype_name "u8", dtype_size 1
+};
+
+enum class DeviceKind : std::uint8_t {  // Device tag with a one-byte underlying type; not referenced elsewhere in this header.
+    CPU,  // first enumerator (value 0)
+    CUDA,  // second enumerator (value 1)
+};
+
+struct Status {  // Result record returned instead of throwing. NOTE(review): no default member initializers, so `Status s;` is uninitialized.
+    bool ok;  // true from success(), false from failure()
+    cudaError_t cuda_error;  // cudaSuccess from success(), otherwise the code passed to failure()
+    const char* message;  // non-owning pointer: "ok" from success(), otherwise the pointer passed to failure()
+
+    static Status success() {  // Builds the success value.
+        return {true, cudaSuccess, "ok"};
+    }
+
+    static Status failure(cudaError_t error, const char* message) {  // Builds a failed value; the message pointer is stored, not copied.
+        return {false, error, message};  // caller must keep `message` alive for as long as the Status is read
+    }
+};
+
+inline const char* dtype_name(DType dtype) {  // Maps a DType to a lowercase string literal; never aborts.
+    switch (dtype) {  // no default label: every listed enumerator returns from its own case
+        case DType::F32:
+            return "f32";
+        case DType::F16:
+            return "f16";
+        case DType::BF16:
+            return "bf16";
+        case DType::I32:
+            return "i32";
+        case DType::U8:
+            return "u8";
+    }
+    return "unknown";  // reached only for a value outside the enumerators (e.g. produced by a cast)
+}
+
+inline std::size_t dtype_size(DType dtype) {  // Returns the per-element size in bytes for a DType; aborts on an unlisted value.
+    switch (dtype) {  // no default label: every listed enumerator returns from its own case
+        case DType::F32:
+            return 4;
+        case DType::F16:
+            return 2;
+        case DType::BF16:
+            return 2;
+        case DType::I32:
+            return 4;
+        case DType::U8:
+            return 1;
+    }
+
+    std::fprintf(stderr, "Unknown CUDA dtype value %u\n", static_cast<unsigned int>(dtype));  // unlike dtype_name, an unlisted value is fatal here
+    std::abort();  // does not return, so no return statement is needed after it
+}
+
+inline bool checked_mul(std::size_t lhs, std::size_t rhs, std::size_t* out) {  // Overflow-checked size_t multiply; true on success with the product in *out.
+    if (lhs != 0 && rhs > std::numeric_limits<std::size_t>::max() / lhs) {  // the lhs != 0 test also guards the division
+        return false;  // product would not fit in size_t; *out is left untouched
+    }
+    *out = lhs * rhs;  // out is dereferenced without a null check, so it must be non-null
+    return true;
+}
+
+inline Status check_cuda(cudaError_t error, const char* expression, const char* file, int line) {  // Converts a cudaError_t into a Status, logging failures to stderr.
+    if (error == cudaSuccess) {
+        return Status::success();  // success path prints nothing
+    }
+
+    std::fprintf(  // failure path: one diagnostic line naming the call site and the failing expression
+        stderr,
+        "CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    return Status::failure(error, expression);  // Status::message holds the expression text, not the CUDA error string
+}
+
+inline void abort_on_cuda(cudaError_t error, const char* expression, const char* file, int line) {  // Fatal variant of check_cuda: returns only on cudaSuccess.
+    if (error == cudaSuccess) {
+        return;  // success path prints nothing
+    }
+
+    std::fprintf(  // failure path: one diagnostic line on stderr before terminating
+        stderr,
+        "Fatal CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    std::abort();  // terminates the process; control never returns to the caller
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
+
+#define QUADTRIX_CUDA_CHECK(expr) \
+    ::quadtrix::cuda::check_cuda((expr), #expr, __FILE__, __LINE__)  // yields the Status from check_cuda; logs on failure and never aborts
+
+#define QUADTRIX_CUDA_ABORT(expr) \
+    ::quadtrix::cuda::abort_on_cuda((expr), #expr, __FILE__, __LINE__)  // void expression; logs and calls std::abort() when expr is not cudaSuccess

From 7c94958781dddc8d38a30d34dd343a00417c7fc7 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 22:34:39 +0530
Subject: [PATCH 4/6] feat(cuda): add TokenBatchView struct and DataLoader stub
 class

---
 CUDA/includes/dataloader.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 CUDA/includes/dataloader.h

diff --git a/CUDA/includes/dataloader.h b/CUDA/includes/dataloader.h
new file mode 100644
index 0000000..fd3c47d
--- /dev/null
+++ b/CUDA/includes/dataloader.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+struct TokenBatchView {  // Pointer-plus-extent view of one token batch; stores raw pointers and frees nothing.
+    const std::int32_t* inputs = nullptr;  // read-only token ids; null by default (element count presumably batch_size * sequence_length - TODO confirm)
+    const std::int32_t* targets = nullptr;  // read-only target ids; null by default (layout presumably matches inputs - TODO confirm)
+    int batch_size = 0;  // 0 by default
+    int sequence_length = 0;  // 0 by default
+};
+
+class DataLoader {  // Stub loader: holds no state and never produces a batch.
+public:
+    DataLoader() = default;  // nothing to initialize
+
+    bool next(TokenBatchView* batch) {  // Resets *batch when batch is non-null; always returns false.
+        if (batch != nullptr) {  // a null batch is tolerated and simply skipped
+            *batch = {};  // value-initialize: null pointers and zero extents
+        }
+        return false;  // no batch is ever available from this stub
+    }
+};
+
+}  // namespace cuda
+}  // namespace quadtrix

From c62c869527bcf83ab494341b1667b7ac95e9af95 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 22:35:34 +0530
Subject: [PATCH 5/6] feat(cuda): add GeLU activation forward and backward
 declarations

- Introduces the `GeluMode` enum to toggle between `Exact` and `Approximate` mathematical variants.
- Declares the `gelu_forward` and `gelu_backward` kernel entrypoints.
- Configures both signatures with optional stream execution and a default mode of `GeluMode::Approximate`.
---
 CUDA/includes/gelu.cuh | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 CUDA/includes/gelu.cuh

diff --git a/CUDA/includes/gelu.cuh b/CUDA/includes/gelu.cuh
new file mode 100644
index 0000000..af87e64
--- /dev/null
+++ b/CUDA/includes/gelu.cuh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class GeluMode : std::uint8_t {  // Selects the GeLU variant; one-byte underlying type.
+    Exact,  // presumably the erf-based formulation - TODO confirm in the kernel source
+    Approximate,  // default argument of both entry points in this header; presumably the tanh approximation - TODO confirm
+};
+
+Status gelu_forward(  // GeLU forward entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    const TensorView& input,  // read-only input view
+    TensorView output,  // view passed by value; presumably receives the activations - TODO confirm
+    GeluMode mode = GeluMode::Approximate,  // variant selector; Approximate when omitted
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+Status gelu_backward(  // GeLU backward entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    const TensorView& grad_output,  // read-only view; presumably the gradient w.r.t. the forward output - TODO confirm
+    const TensorView& input,  // read-only view; same parameter name as the forward input
+    TensorView grad_input,  // view passed by value; presumably receives the gradient w.r.t. input - TODO confirm
+    GeluMode mode = GeluMode::Approximate,  // variant selector; NOTE(review): likely must match the forward call - confirm
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+}  // namespace cuda
+}  // namespace quadtrix

From 28117dc6f6e5bb2be6544f0a9007043a943686c1 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Mon, 1 Jun 2026 22:47:36 +0530
Subject: [PATCH 6/6] feat(cuda): add gradient norm calculation and clipping
 interfaces

---
 CUDA/includes/global_norm.cuh | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 CUDA/includes/global_norm.cuh

diff --git a/CUDA/includes/global_norm.cuh b/CUDA/includes/global_norm.cuh
new file mode 100644
index 0000000..f418ab7
--- /dev/null
+++ b/CUDA/includes/global_norm.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status global_norm_squared(  // Squared-norm entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    const TensorView& grads,  // read-only gradient view
+    TensorView partial_sums,  // view passed by value; presumably receives partial sums that the caller reduces - TODO confirm size and layout
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+Status clip_gradients_by_global_norm(  // Gradient-clipping entry point: declaration only, no definition in this header; reports its outcome as a Status.
+    TensorView grads,  // view passed by value; presumably scaled in place - TODO confirm
+    float global_norm,  // host-side scalar; presumably the norm (not the squared norm) - TODO confirm
+    float max_norm,  // host-side scalar clipping threshold
+    cudaStream_t stream = nullptr);  // optional stream argument; callers that omit it pass nullptr
+
+inline float clip_scale(float global_norm, float max_norm) {  // Host helper: returns max_norm / global_norm when clipping applies, else 1.0f.
+    return global_norm > max_norm && global_norm > 0.0f ? max_norm / global_norm : 1.0f;  // a NaN argument fails the comparison and gives 1.0f; NOTE(review): a negative max_norm with a positive global_norm gives a negative scale
+}
+
+}  // namespace cuda
+}  // namespace quadtrix