48 changes: 46 additions & 2 deletions CMakeLists.txt
@@ -114,6 +114,11 @@ endif()
# CUDA Configuration
# ============================================================================
if(USE_CUDA)
# CUDA optimization options
option(CUDA_TENSOR_CORES "Enable tensor core kernels (Volta SM 7.0+)" ON)
option(CUDA_PROFILING "Enable NVTX profiling markers" OFF)
option(CUDA_WARP_PRIMITIVES "Enable warp-level primitive optimizations" ON)

# Find CUDA toolkit
find_package(CUDAToolkit QUIET)

@@ -164,6 +169,24 @@ if(USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fPIC")

# Enable tensor core support
if(CUDA_TENSOR_CORES)
add_definitions(-DUSE_CUDA_TENSOR_CORES)
message(STATUS " Tensor Cores: ENABLED")
endif()

# Enable NVTX profiling
if(CUDA_PROFILING)
add_definitions(-DUSE_NVTX)
message(STATUS " NVTX Profiling: ENABLED")
endif()

# Enable warp primitives
if(CUDA_WARP_PRIMITIVES)
add_definitions(-DUSE_CUDA_WARP_PRIMITIVES)
message(STATUS " Warp Primitives: ENABLED")
endif()

# Enable separable compilation for device code
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
@@ -221,6 +244,28 @@ set(GPU_SOURCES src/gpu/gpu_nnue_integration.cpp src/gpu/gpu_mcts_backend.cpp

# CUDA source files
set(CUDA_SOURCES "")
if(USE_CUDA AND CUDA_AVAILABLE)
set(CUDA_SOURCES
src/gpu/cuda/cuda_backend.cu
src/gpu/cuda/cuda_memory.cu
src/gpu/cuda/kernels/nnue_kernels.cu)

# Add advanced optimization kernels if enabled
if(CUDA_WARP_PRIMITIVES)
list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_simd.cu)
endif()

if(CUDA_TENSOR_CORES)
list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_tensor_core.cu)
endif()

# Add advanced features
list(APPEND CUDA_SOURCES
src/gpu/cuda/cuda_graphs.cu
src/gpu/cuda/cuda_multi_gpu.cu
src/gpu/cuda/cuda_fp16_weights.cu
src/gpu/cuda/kernels/nnue_persistent.cu)
endif()

# MCTS source files (hybrid search) Core files needed for all MCTS modes: -
# stockfish_adapter: Interface between Stockfish and MCTS - position_classifier:
@@ -265,10 +310,9 @@ if(USE_METAL AND METAL_CPP_AVAILABLE)
endif()
elseif(USE_CUDA AND CUDA_AVAILABLE)
# CUDA GPU acceleration
set(CUDA_SOURCES src/gpu/cuda/cuda_backend.cu
src/gpu/cuda/kernels/nnue_kernels.cu)
add_definitions(-DUSE_CUDA)
message(STATUS "CUDA GPU acceleration: ENABLED")
message(STATUS " CUDA source files: ${CUDA_SOURCES}")
else()
# CPU fallback backend
set(GPU_SOURCES ${GPU_SOURCES} src/gpu/cpu_backend.cpp)
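The three new options only inject compile definitions and add kernel sources to the build; the definitions are presumably checked in the sources to select a code path at compile time. A minimal, self-contained sketch of that pattern, with placeholder names rather than the engine's actual kernels:

#include <cstdio>

// Placeholder illustration: the macros match the add_definitions() calls above,
// but the returned strings stand in for the real kernel dispatch.
#if defined(USE_CUDA_TENSOR_CORES)
static const char* nnue_path() { return "tensor-core kernels (nnue_tensor_core.cu)"; }
#elif defined(USE_CUDA_WARP_PRIMITIVES)
static const char* nnue_path() { return "warp-primitive kernels (nnue_simd.cu)"; }
#else
static const char* nnue_path() { return "baseline kernels (nnue_kernels.cu)"; }
#endif

int main() {
  std::printf("NNUE inference path: %s\n", nnue_path());
  return 0;
}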
73 changes: 64 additions & 9 deletions src/gpu/cuda/cuda_backend.cu
@@ -15,6 +15,8 @@
#ifdef USE_CUDA

#include "cuda_backend.h"
#include "cuda_memory.h"
#include "cuda_profiling.h"
#include <atomic>
#include <cstring>
#include <cuda_runtime.h>
@@ -88,11 +90,11 @@ CUDABuffer::CUDABuffer(void *device_ptr, void *host_ptr, size_t size,
CUDABuffer::~CUDABuffer() {
if (device_ptr_) {
if (unified_) {
cudaFree(device_ptr_);
CUDA::UnifiedMemoryManager::free_unified(device_ptr_);
} else {
cudaFree(device_ptr_);
if (host_ptr_) {
cudaFreeHost(host_ptr_);
CUDA::PinnedMemoryManager::free_pinned(host_ptr_);
}
}
}
@@ -263,9 +265,11 @@ void CUDACommandEncoder::barrier() { cudaStreamSynchronize(stream_); }
CUDABackend::CUDABackend()
: device_id_(-1), compute_capability_major_(0),
compute_capability_minor_(0), total_memory_(0), multiprocessor_count_(0),
unified_memory_supported_(false), default_stream_(nullptr),
unified_memory_supported_(false), tensor_cores_available_(false),
int8_tensor_cores_available_(false), default_stream_(nullptr),
stream_index_(0), allocated_memory_(0), peak_memory_(0),
initialized_(false) {}
initialized_(false), use_cuda_graphs_(false), use_multi_gpu_(false),
use_persistent_kernels_(false), use_fp16_weights_(false) {}

CUDABackend::~CUDABackend() { cleanup(); }

@@ -333,6 +337,9 @@ bool CUDABackend::initialize() {
cudaStreamCreate(&parallel_streams_[i]);
}

// Detect architecture-specific features
detect_architecture_features();

initialized_ = true;

std::cout << "[CUDA Backend] Initialized: " << device_name_ << std::endl;
@@ -345,10 +352,51 @@
<< std::endl;
std::cout << "[CUDA Backend] Unified Memory: "
<< (unified_memory_supported_ ? "Yes" : "No") << std::endl;
std::cout << "[CUDA Backend] Tensor Cores: "
<< (tensor_cores_available_ ? "Yes" : "No") << std::endl;
if (tensor_cores_available_) {
std::cout << "[CUDA Backend] INT8 Tensor Cores: "
<< (int8_tensor_cores_available_ ? "Yes" : "No") << std::endl;
}

return true;
}

void CUDABackend::detect_architecture_features() {
// Detect tensor core support
// Volta (SM 7.0) and later have FP16 tensor cores
tensor_cores_available_ = compute_capability_major_ >= 7;

// Turing (SM 7.5) and later have INT8 tensor cores
this->int8_tensor_cores_available_ = (compute_capability_major_ > 7) ||
(compute_capability_major_ == 7 &&
compute_capability_minor_ >= 5);

// Print architecture-specific information
std::string arch_name;
if (compute_capability_major_ == 6 && compute_capability_minor_ == 0) {
arch_name = "Pascal (GP100)";
} else if (compute_capability_major_ == 6 && compute_capability_minor_ == 1) {
arch_name = "Pascal (GP10x)";
} else if (compute_capability_major_ == 7 && compute_capability_minor_ == 0) {
arch_name = "Volta";
} else if (compute_capability_major_ == 7 && compute_capability_minor_ == 5) {
arch_name = "Turing";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 0) {
arch_name = "Ampere (A100)";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 6) {
arch_name = "Ampere (GA10x)";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 9) {
arch_name = "Ada Lovelace";
} else if (compute_capability_major_ == 9 && compute_capability_minor_ == 0) {
arch_name = "Hopper";
} else {
arch_name = "Unknown";
}

std::cout << "[CUDA Backend] Architecture: " << arch_name << std::endl;
}

void CUDABackend::cleanup() {
if (!initialized_) {
return;
@@ -462,11 +510,17 @@ std::unique_ptr<Buffer> CUDABackend::create_buffer(size_t size, MemoryMode mode,
bool unified = false;

if (mode == MemoryMode::Shared && unified_memory_supported_) {
// Use unified memory
cudaError_t err = cudaMallocManaged(&device_ptr, size);
if (err != cudaSuccess) {
// Use optimized unified memory with hints
device_ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id_);
if (!device_ptr) {
return nullptr;
}

// For read-only buffers (like weights), use read-mostly hint
if (usage == BufferUsage::Static) {
cudaMemAdvise(device_ptr, size, cudaMemAdviseSetReadMostly, device_id_);
}

unified = true;
} else {
// Allocate device and host memory separately
@@ -476,8 +530,9 @@
}

if (mode != MemoryMode::Private) {
err = cudaMallocHost(&host_ptr, size);
if (err != cudaSuccess) {
// Use pinned memory for faster transfers
host_ptr = CUDA::PinnedMemoryManager::allocate_pinned(size);
if (!host_ptr) {
cudaFree(device_ptr);
return nullptr;
}
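The new create_buffer path leans on two allocation patterns: unified (managed) memory with a read-mostly hint for static buffers such as weights, and pinned host memory as a staging area for everything else. The UnifiedMemoryManager and PinnedMemoryManager wrappers are not part of this diff, so the self-contained sketch below uses the raw CUDA runtime calls they presumably wrap:

#include <cuda_runtime.h>
#include <cstdio>

// Sketch (assumptions noted above): allocate a unified buffer, mark it read-mostly
// so the GPU can keep a local read-only copy, then allocate a pinned (page-locked)
// host buffer for explicit host-to-device transfers.
int main() {
  int device = 0;
  cudaGetDevice(&device);

  const size_t bytes = 1 << 20;

  void* unified = nullptr;
  if (cudaMallocManaged(&unified, bytes) != cudaSuccess) return 1;
  // Read-mostly hint: appropriate for weights that are written once and then only read.
  cudaMemAdvise(unified, bytes, cudaMemAdviseSetReadMostly, device);

  void* pinned = nullptr;
  if (cudaMallocHost(&pinned, bytes) != cudaSuccess) {
    cudaFree(unified);
    return 1;
  }

  std::printf("unified=%p pinned=%p\n", unified, pinned);

  cudaFreeHost(pinned);
  cudaFree(unified);
  return 0;
}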
26 changes: 26 additions & 0 deletions src/gpu/cuda/cuda_backend.h
@@ -169,13 +169,31 @@ class CUDABackend : public Backend {
int compute_capability_minor() const { return compute_capability_minor_; }
size_t total_memory() const { return total_memory_; }
int multiprocessor_count() const { return multiprocessor_count_; }
bool has_tensor_cores() const { return tensor_cores_available_; }
bool has_int8_tensor_cores() const { return int8_tensor_cores_available_; }
bool has_warp_shuffle() const { return compute_capability_major_ >= 3; }
bool has_cooperative_groups() const { return compute_capability_major_ >= 6; }

// Advanced feature support
void enable_cuda_graphs(bool enable) { use_cuda_graphs_ = enable; }
bool is_cuda_graphs_enabled() const { return use_cuda_graphs_; }

void enable_multi_gpu(bool enable) { use_multi_gpu_ = enable; }
bool is_multi_gpu_enabled() const { return use_multi_gpu_; }

void enable_persistent_kernels(bool enable) { use_persistent_kernels_ = enable; }
bool is_persistent_kernels_enabled() const { return use_persistent_kernels_; }

void enable_fp16_weights(bool enable) { use_fp16_weights_ = enable; }
bool is_fp16_weights_enabled() const { return use_fp16_weights_; }

private:
CUDABackend();
~CUDABackend();

bool initialize();
void cleanup();
void detect_architecture_features();

int device_id_;
std::string device_name_;
@@ -184,6 +202,14 @@
size_t total_memory_;
int multiprocessor_count_;
bool unified_memory_supported_;
bool tensor_cores_available_;
bool int8_tensor_cores_available_;

// Feature flags
bool use_cuda_graphs_;
bool use_multi_gpu_;
bool use_persistent_kernels_;
bool use_fp16_weights_;

cudaStream_t default_stream_;
std::vector<cudaStream_t> parallel_streams_;
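The new accessors reduce to compute-capability thresholds (tensor cores from SM 7.0, INT8 tensor cores from SM 7.5, warp shuffle from SM 3.0, cooperative groups from SM 6.0). A standalone sketch of the same checks against cudaGetDeviceProperties, useful for verifying a card outside the backend class:

#include <cuda_runtime.h>
#include <cstdio>

// Standalone sketch of the capability thresholds used by the new accessors.
int main() {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;

  bool tensor_cores = prop.major >= 7;
  bool int8_tensor_cores = (prop.major > 7) || (prop.major == 7 && prop.minor >= 5);
  bool warp_shuffle = prop.major >= 3;
  bool cooperative_groups = prop.major >= 6;

  std::printf("%s (SM %d.%d): tensor cores %d, int8 tensor cores %d, "
              "warp shuffle %d, cooperative groups %d\n",
              prop.name, prop.major, prop.minor, tensor_cores,
              int8_tensor_cores, warp_shuffle, cooperative_groups);
  return 0;
}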
125 changes: 125 additions & 0 deletions src/gpu/cuda/cuda_fp16_weights.cu
@@ -0,0 +1,125 @@
/*
MetalFish - A GPU-accelerated UCI chess engine
Copyright (C) 2025 Nripesh Niketan

FP16 Weight Storage Implementation
*/

#ifdef USE_CUDA

#include "cuda_fp16_weights.h"
#include <iostream>
#include <unordered_map>
#include <string>

namespace MetalFish {
namespace GPU {
namespace CUDA {

FP16WeightManager::~FP16WeightManager() {
clear_all();
}

half* FP16WeightManager::convert_and_store_weights(
const int16_t* int16_weights, size_t size, float scale) {

// Allocate host memory for FP16 conversion
std::vector<half> fp16_host(size);

// Convert INT16 to FP16
for (size_t i = 0; i < size; i++) {
float val = static_cast<float>(int16_weights[i]) / scale;
fp16_host[i] = __float2half(val);
}

// Allocate device memory
half* device_ptr = nullptr;
cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half));
if (err != cudaSuccess) {
std::cerr << "[FP16 Weights] Failed to allocate device memory: "
<< cudaGetErrorString(err) << std::endl;
return nullptr;
}

// Copy to device
err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "[FP16 Weights] Failed to copy to device: "
<< cudaGetErrorString(err) << std::endl;
cudaFree(device_ptr);
return nullptr;
}

total_memory_ += size * sizeof(half);
return device_ptr;
}
Review comment: FP16WeightManager never stores allocated pointers in maps (Medium Severity)

The convert_and_store_weights and convert_and_store_biases functions allocate device memory and return the pointer, but never add entries to the weights_ or biases_ maps. This means get_fp16_weights/get_fp16_biases will never find these allocations, and clear_all() (called in the destructor) won't free them, causing memory leaks.

Additional Locations (1)

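One possible shape of a fix (a hypothetical sketch, not the PR's code): register each allocation under a layer-name key so lookups and cleanup can find it. The current signatures take no layer name, and the real map value type lives in cuda_fp16_weights.h, which is not part of this diff, so both the key parameter and the bare-pointer registry below are assumptions:

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical standalone illustration of the fix pattern: convert INT16 weights
// to FP16, upload them, and register the device pointer under a layer name so a
// later lookup can find it and a cleanup pass can cudaFree() it.
half* convert_and_register(std::unordered_map<std::string, half*>& registry,
                           const std::string& layer_name,
                           const int16_t* int16_weights, size_t size, float scale) {
  std::vector<half> fp16_host(size);
  for (size_t i = 0; i < size; i++)
    fp16_host[i] = __float2half(static_cast<float>(int16_weights[i]) / scale);

  half* device_ptr = nullptr;
  if (cudaMalloc(&device_ptr, size * sizeof(half)) != cudaSuccess)
    return nullptr;
  if (cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
                 cudaMemcpyHostToDevice) != cudaSuccess) {
    cudaFree(device_ptr);
    return nullptr;
  }

  registry[layer_name] = device_ptr;  // the missing bookkeeping step
  return device_ptr;
}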


half* FP16WeightManager::convert_and_store_biases(
const int32_t* int32_biases, size_t size, float scale) {

// Allocate host memory for FP16 conversion
std::vector<half> fp16_host(size);

// Convert INT32 to FP16
for (size_t i = 0; i < size; i++) {
float val = static_cast<float>(int32_biases[i]) / scale;
fp16_host[i] = __float2half(val);
}

// Allocate device memory
half* device_ptr = nullptr;
cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half));
if (err != cudaSuccess) {
std::cerr << "[FP16 Biases] Failed to allocate device memory: "
<< cudaGetErrorString(err) << std::endl;
return nullptr;
}

// Copy to device
err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "[FP16 Biases] Failed to copy to device: "
<< cudaGetErrorString(err) << std::endl;
cudaFree(device_ptr);
return nullptr;
}

total_memory_ += size * sizeof(half);
return device_ptr;
}

half* FP16WeightManager::get_fp16_weights(const std::string& layer_name) {
auto it = weights_.find(layer_name);
return (it != weights_.end()) ? it->second.device_ptr : nullptr;
}

half* FP16WeightManager::get_fp16_biases(const std::string& layer_name) {
auto it = biases_.find(layer_name);
return (it != biases_.end()) ? it->second.device_ptr : nullptr;
}

void FP16WeightManager::clear_all() {
for (auto& [name, data] : weights_) {
if (data.device_ptr) {
cudaFree(data.device_ptr);
}
}

for (auto& [name, data] : biases_) {
if (data.device_ptr) {
cudaFree(data.device_ptr);
}
}

weights_.clear();
biases_.clear();
total_memory_ = 0;
}

} // namespace CUDA
} // namespace GPU
} // namespace MetalFish

#endif // USE_CUDA