diff --git a/CMakeLists.txt b/CMakeLists.txt index 006092d0..374d84c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,11 @@ endif() # CUDA Configuration # ============================================================================ if(USE_CUDA) + # CUDA optimization options + option(CUDA_TENSOR_CORES "Enable tensor core kernels (Volta SM 7.0+)" ON) + option(CUDA_PROFILING "Enable NVTX profiling markers" OFF) + option(CUDA_WARP_PRIMITIVES "Enable warp-level primitive optimizations" ON) + # Find CUDA toolkit find_package(CUDAToolkit QUIET) @@ -164,6 +169,24 @@ if(USE_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fPIC") + + # Enable tensor core support + if(CUDA_TENSOR_CORES) + add_definitions(-DUSE_CUDA_TENSOR_CORES) + message(STATUS " Tensor Cores: ENABLED") + endif() + + # Enable NVTX profiling + if(CUDA_PROFILING) + add_definitions(-DUSE_NVTX) + message(STATUS " NVTX Profiling: ENABLED") + endif() + + # Enable warp primitives + if(CUDA_WARP_PRIMITIVES) + add_definitions(-DUSE_CUDA_WARP_PRIMITIVES) + message(STATUS " Warp Primitives: ENABLED") + endif() # Enable separable compilation for device code set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) @@ -221,6 +244,28 @@ set(GPU_SOURCES src/gpu/gpu_nnue_integration.cpp src/gpu/gpu_mcts_backend.cpp # CUDA source files set(CUDA_SOURCES "") +if(USE_CUDA AND CUDA_AVAILABLE) + set(CUDA_SOURCES + src/gpu/cuda/cuda_backend.cu + src/gpu/cuda/cuda_memory.cu + src/gpu/cuda/kernels/nnue_kernels.cu) + + # Add advanced optimization kernels if enabled + if(CUDA_WARP_PRIMITIVES) + list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_simd.cu) + endif() + + if(CUDA_TENSOR_CORES) + list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_tensor_core.cu) + endif() + + # Add advanced features + list(APPEND CUDA_SOURCES + src/gpu/cuda/cuda_graphs.cu + src/gpu/cuda/cuda_multi_gpu.cu + src/gpu/cuda/cuda_fp16_weights.cu + src/gpu/cuda/kernels/nnue_persistent.cu) +endif() # MCTS source files (hybrid search) Core files needed for all MCTS modes: - # stockfish_adapter: Interface between Stockfish and MCTS - position_classifier: @@ -265,10 +310,9 @@ if(USE_METAL AND METAL_CPP_AVAILABLE) endif() elseif(USE_CUDA AND CUDA_AVAILABLE) # CUDA GPU acceleration - set(CUDA_SOURCES src/gpu/cuda/cuda_backend.cu - src/gpu/cuda/kernels/nnue_kernels.cu) add_definitions(-DUSE_CUDA) message(STATUS "CUDA GPU acceleration: ENABLED") + message(STATUS " CUDA source files: ${CUDA_SOURCES}") else() # CPU fallback backend set(GPU_SOURCES ${GPU_SOURCES} src/gpu/cpu_backend.cpp) diff --git a/src/gpu/cuda/cuda_backend.cu b/src/gpu/cuda/cuda_backend.cu index f30374b4..b2e60310 100644 --- a/src/gpu/cuda/cuda_backend.cu +++ b/src/gpu/cuda/cuda_backend.cu @@ -15,6 +15,8 @@ #ifdef USE_CUDA #include "cuda_backend.h" +#include "cuda_memory.h" +#include "cuda_profiling.h" #include #include #include @@ -88,11 +90,11 @@ CUDABuffer::CUDABuffer(void *device_ptr, void *host_ptr, size_t size, CUDABuffer::~CUDABuffer() { if (device_ptr_) { if (unified_) { - cudaFree(device_ptr_); + CUDA::UnifiedMemoryManager::free_unified(device_ptr_); } else { cudaFree(device_ptr_); if (host_ptr_) { - cudaFreeHost(host_ptr_); + CUDA::PinnedMemoryManager::free_pinned(host_ptr_); } } } @@ -263,9 +265,11 @@ void CUDACommandEncoder::barrier() { cudaStreamSynchronize(stream_); } CUDABackend::CUDABackend() : device_id_(-1), compute_capability_major_(0), compute_capability_minor_(0), 
total_memory_(0), multiprocessor_count_(0),
-      unified_memory_supported_(false), default_stream_(nullptr),
+      unified_memory_supported_(false), tensor_cores_available_(false),
+      int8_tensor_cores_available_(false), default_stream_(nullptr),
       stream_index_(0), allocated_memory_(0), peak_memory_(0),
-      initialized_(false) {}
+      initialized_(false), use_cuda_graphs_(false), use_multi_gpu_(false),
+      use_persistent_kernels_(false), use_fp16_weights_(false) {}
 
 CUDABackend::~CUDABackend() { cleanup(); }
 
@@ -333,6 +337,9 @@ bool CUDABackend::initialize() {
     cudaStreamCreate(&parallel_streams_[i]);
   }
 
+  // Detect architecture-specific features
+  detect_architecture_features();
+
   initialized_ = true;
 
   std::cout << "[CUDA Backend] Initialized: " << device_name_ << std::endl;
@@ -345,10 +352,51 @@ bool CUDABackend::initialize() {
             << std::endl;
   std::cout << "[CUDA Backend] Unified Memory: "
             << (unified_memory_supported_ ? "Yes" : "No") << std::endl;
+  std::cout << "[CUDA Backend] Tensor Cores: "
+            << (tensor_cores_available_ ? "Yes" : "No") << std::endl;
+  if (tensor_cores_available_) {
+    std::cout << "[CUDA Backend] INT8 Tensor Cores: "
+              << (int8_tensor_cores_available_ ? "Yes" : "No") << std::endl;
+  }
 
   return true;
 }
 
+void CUDABackend::detect_architecture_features() {
+  // Detect tensor core support
+  // Volta (SM 7.0) and later have FP16 tensor cores
+  tensor_cores_available_ = compute_capability_major_ >= 7;
+
+  // Turing (SM 7.5) and later have INT8 tensor cores
+  this->int8_tensor_cores_available_ = (compute_capability_major_ > 7) ||
+                                       (compute_capability_major_ == 7 &&
+                                        compute_capability_minor_ >= 5);
+
+  // Print architecture-specific information
+  std::string arch_name;
+  if (compute_capability_major_ == 6 && compute_capability_minor_ == 0) {
+    arch_name = "Pascal (GP100)";
+  } else if (compute_capability_major_ == 6 && compute_capability_minor_ == 1) {
+    arch_name = "Pascal (GP10x)";
+  } else if (compute_capability_major_ == 7 && compute_capability_minor_ == 0) {
+    arch_name = "Volta";
+  } else if (compute_capability_major_ == 7 && compute_capability_minor_ == 5) {
+    arch_name = "Turing";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 0) {
+    arch_name = "Ampere (A100)";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 6) {
+    arch_name = "Ampere (GA10x)";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 9) {
+    arch_name = "Ada Lovelace";
+  } else if (compute_capability_major_ == 9 && compute_capability_minor_ == 0) {
+    arch_name = "Hopper";
+  } else {
+    arch_name = "Unknown";
+  }
+
+  std::cout << "[CUDA Backend] Architecture: " << arch_name << std::endl;
+}
+
 void CUDABackend::cleanup() {
   if (!initialized_) {
     return;
@@ -462,11 +510,17 @@ std::unique_ptr CUDABackend::create_buffer(size_t size, MemoryMode mode,
   bool unified = false;
 
   if (mode == MemoryMode::Shared && unified_memory_supported_) {
-    // Use unified memory
-    cudaError_t err = cudaMallocManaged(&device_ptr, size);
-    if (err != cudaSuccess) {
+    // Use optimized unified memory with hints
+    device_ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id_);
+    if (!device_ptr) {
       return nullptr;
     }
+
+    // For read-only buffers (like weights), use read-mostly hint
+    if (usage == BufferUsage::Static) {
+      cudaMemAdvise(device_ptr, size, cudaMemAdviseSetReadMostly, device_id_);
+    }
+
     unified = true;
   } else {
    // Allocate device and host memory separately
@@ -476,8 +530,9 @@ std::unique_ptr CUDABackend::create_buffer(size_t size, MemoryMode mode,
     }
 
     if (mode !=
MemoryMode::Private) { - err = cudaMallocHost(&host_ptr, size); - if (err != cudaSuccess) { + // Use pinned memory for faster transfers + host_ptr = CUDA::PinnedMemoryManager::allocate_pinned(size); + if (!host_ptr) { cudaFree(device_ptr); return nullptr; } diff --git a/src/gpu/cuda/cuda_backend.h b/src/gpu/cuda/cuda_backend.h index a7036018..2760100d 100644 --- a/src/gpu/cuda/cuda_backend.h +++ b/src/gpu/cuda/cuda_backend.h @@ -169,6 +169,23 @@ class CUDABackend : public Backend { int compute_capability_minor() const { return compute_capability_minor_; } size_t total_memory() const { return total_memory_; } int multiprocessor_count() const { return multiprocessor_count_; } + bool has_tensor_cores() const { return tensor_cores_available_; } + bool has_int8_tensor_cores() const { return int8_tensor_cores_available_; } + bool has_warp_shuffle() const { return compute_capability_major_ >= 3; } + bool has_cooperative_groups() const { return compute_capability_major_ >= 6; } + + // Advanced feature support + void enable_cuda_graphs(bool enable) { use_cuda_graphs_ = enable; } + bool is_cuda_graphs_enabled() const { return use_cuda_graphs_; } + + void enable_multi_gpu(bool enable) { use_multi_gpu_ = enable; } + bool is_multi_gpu_enabled() const { return use_multi_gpu_; } + + void enable_persistent_kernels(bool enable) { use_persistent_kernels_ = enable; } + bool is_persistent_kernels_enabled() const { return use_persistent_kernels_; } + + void enable_fp16_weights(bool enable) { use_fp16_weights_ = enable; } + bool is_fp16_weights_enabled() const { return use_fp16_weights_; } private: CUDABackend(); @@ -176,6 +193,7 @@ class CUDABackend : public Backend { bool initialize(); void cleanup(); + void detect_architecture_features(); int device_id_; std::string device_name_; @@ -184,6 +202,14 @@ class CUDABackend : public Backend { size_t total_memory_; int multiprocessor_count_; bool unified_memory_supported_; + bool tensor_cores_available_; + bool int8_tensor_cores_available_; + + // Feature flags + bool use_cuda_graphs_; + bool use_multi_gpu_; + bool use_persistent_kernels_; + bool use_fp16_weights_; cudaStream_t default_stream_; std::vector parallel_streams_; diff --git a/src/gpu/cuda/cuda_fp16_weights.cu b/src/gpu/cuda/cuda_fp16_weights.cu new file mode 100644 index 00000000..cfd8d64e --- /dev/null +++ b/src/gpu/cuda/cuda_fp16_weights.cu @@ -0,0 +1,125 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + FP16 Weight Storage Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_fp16_weights.h" +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +FP16WeightManager::~FP16WeightManager() { + clear_all(); +} + +half* FP16WeightManager::convert_and_store_weights( + const int16_t* int16_weights, size_t size, float scale) { + + // Allocate host memory for FP16 conversion + std::vector fp16_host(size); + + // Convert INT16 to FP16 + for (size_t i = 0; i < size; i++) { + float val = static_cast(int16_weights[i]) / scale; + fp16_host[i] = __float2half(val); + } + + // Allocate device memory + half* device_ptr = nullptr; + cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half)); + if (err != cudaSuccess) { + std::cerr << "[FP16 Weights] Failed to allocate device memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Copy to device + err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "[FP16 Weights] Failed 
to copy to device: " + << cudaGetErrorString(err) << std::endl; + cudaFree(device_ptr); + return nullptr; + } + + total_memory_ += size * sizeof(half); + return device_ptr; +} + +half* FP16WeightManager::convert_and_store_biases( + const int32_t* int32_biases, size_t size, float scale) { + + // Allocate host memory for FP16 conversion + std::vector fp16_host(size); + + // Convert INT32 to FP16 + for (size_t i = 0; i < size; i++) { + float val = static_cast(int32_biases[i]) / scale; + fp16_host[i] = __float2half(val); + } + + // Allocate device memory + half* device_ptr = nullptr; + cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half)); + if (err != cudaSuccess) { + std::cerr << "[FP16 Biases] Failed to allocate device memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Copy to device + err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "[FP16 Biases] Failed to copy to device: " + << cudaGetErrorString(err) << std::endl; + cudaFree(device_ptr); + return nullptr; + } + + total_memory_ += size * sizeof(half); + return device_ptr; +} + +half* FP16WeightManager::get_fp16_weights(const std::string& layer_name) { + auto it = weights_.find(layer_name); + return (it != weights_.end()) ? it->second.device_ptr : nullptr; +} + +half* FP16WeightManager::get_fp16_biases(const std::string& layer_name) { + auto it = biases_.find(layer_name); + return (it != biases_.end()) ? it->second.device_ptr : nullptr; +} + +void FP16WeightManager::clear_all() { + for (auto& [name, data] : weights_) { + if (data.device_ptr) { + cudaFree(data.device_ptr); + } + } + + for (auto& [name, data] : biases_) { + if (data.device_ptr) { + cudaFree(data.device_ptr); + } + } + + weights_.clear(); + biases_.clear(); + total_memory_ = 0; +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA diff --git a/src/gpu/cuda/cuda_fp16_weights.h b/src/gpu/cuda/cuda_fp16_weights.h new file mode 100644 index 00000000..8daac2d3 --- /dev/null +++ b/src/gpu/cuda/cuda_fp16_weights.h @@ -0,0 +1,93 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + FP16 Weight Storage + + Provides FP16 weight storage and conversion for tensor core compatibility. +*/ + +#ifndef CUDA_FP16_WEIGHTS_H +#define CUDA_FP16_WEIGHTS_H + +#ifdef USE_CUDA + +#include +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * FP16 Weight Manager + * + * Manages conversion and storage of network weights in FP16 format + * for tensor core acceleration. 
+ */ +class FP16WeightManager { +public: + FP16WeightManager() = default; + ~FP16WeightManager(); + + /** + * Convert and store weights in FP16 format + * @param int16_weights Original INT16 weights + * @param size Number of weight elements + * @param scale Scale factor for conversion + * @return Device pointer to FP16 weights + */ + half* convert_and_store_weights(const int16_t* int16_weights, + size_t size, float scale = 64.0f); + + /** + * Convert and store biases in FP16 format + * @param int32_biases Original INT32 biases + * @param size Number of bias elements + * @param scale Scale factor for conversion + * @return Device pointer to FP16 biases + */ + half* convert_and_store_biases(const int32_t* int32_biases, + size_t size, float scale = 64.0f); + + /** + * Get FP16 weights for a layer + */ + half* get_fp16_weights(const std::string& layer_name); + + /** + * Get FP16 biases for a layer + */ + half* get_fp16_biases(const std::string& layer_name); + + /** + * Free all FP16 weights + */ + void clear_all(); + + /** + * Get total memory used by FP16 weights + */ + size_t get_memory_usage() const { return total_memory_; } + +private: + struct WeightData { + half* device_ptr = nullptr; + size_t size = 0; + }; + + std::unordered_map weights_; + std::unordered_map biases_; + size_t total_memory_ = 0; +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA +#endif // CUDA_FP16_WEIGHTS_H diff --git a/src/gpu/cuda/cuda_graphs.cu b/src/gpu/cuda/cuda_graphs.cu new file mode 100644 index 00000000..ec0eb856 --- /dev/null +++ b/src/gpu/cuda/cuda_graphs.cu @@ -0,0 +1,132 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Graphs Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_graphs.h" +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +GraphManager::~GraphManager() { + clear_all(); +} + +bool GraphManager::begin_capture(cudaStream_t stream, const std::string& name) { + if (has_graph(name)) { + std::cerr << "[CUDA Graphs] Graph '" << name << "' already exists" << std::endl; + return false; + } + + cudaError_t err = cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to begin capture: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + current_capture_name_ = name; + return true; +} + +bool GraphManager::end_capture(cudaStream_t stream, const std::string& name) { + if (current_capture_name_ != name) { + std::cerr << "[CUDA Graphs] Capture name mismatch" << std::endl; + cudaStreamEndCapture(stream, nullptr); // Abort capture + return false; + } + + GraphData data; + cudaError_t err = cudaStreamEndCapture(stream, &data.graph); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to end capture: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + // Get node count + cudaGraphGetNodes(data.graph, nullptr, &data.node_count); + + // Instantiate the graph for execution + err = cudaGraphInstantiate(&data.exec, data.graph, nullptr, nullptr, 0); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to instantiate graph: " + << cudaGetErrorString(err) << std::endl; + cudaGraphDestroy(data.graph); + return false; + } + + graphs_[name] = data; + current_capture_name_.clear(); + + std::cout << "[CUDA Graphs] Captured '" << name << "' with " + << data.node_count << " nodes" << std::endl; + return true; +} + +bool GraphManager::launch_graph(const std::string& name, 
cudaStream_t stream) { + auto it = graphs_.find(name); + if (it == graphs_.end()) { + std::cerr << "[CUDA Graphs] Graph '" << name << "' not found" << std::endl; + return false; + } + + cudaError_t err = cudaGraphLaunch(it->second.exec, stream); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to launch graph: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + return true; +} + +bool GraphManager::has_graph(const std::string& name) const { + return graphs_.find(name) != graphs_.end(); +} + +void GraphManager::remove_graph(const std::string& name) { + auto it = graphs_.find(name); + if (it != graphs_.end()) { + if (it->second.exec) { + cudaGraphExecDestroy(it->second.exec); + } + if (it->second.graph) { + cudaGraphDestroy(it->second.graph); + } + graphs_.erase(it); + } +} + +void GraphManager::clear_all() { + for (auto& [name, data] : graphs_) { + if (data.exec) { + cudaGraphExecDestroy(data.exec); + } + if (data.graph) { + cudaGraphDestroy(data.graph); + } + } + graphs_.clear(); +} + +GraphManager::GraphStats GraphManager::get_stats() const { + GraphStats stats{0, 0}; + stats.num_graphs = graphs_.size(); + for (const auto& [name, data] : graphs_) { + stats.total_nodes += data.node_count; + } + return stats; +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA diff --git a/src/gpu/cuda/cuda_graphs.h b/src/gpu/cuda/cuda_graphs.h new file mode 100644 index 00000000..69d0362d --- /dev/null +++ b/src/gpu/cuda/cuda_graphs.h @@ -0,0 +1,117 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Graphs Support + + Implements CUDA graphs for reduced kernel launch overhead. + CUDA graphs capture a sequence of operations and replay them efficiently. +*/ + +#ifndef CUDA_GRAPHS_H +#define CUDA_GRAPHS_H + +#ifdef USE_CUDA + +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * CUDA Graph Manager + * + * Captures and replays sequences of CUDA operations for improved performance. + * Particularly useful for repetitive evaluation patterns in NNUE. 
+ */ +class GraphManager { +public: + GraphManager() = default; + ~GraphManager(); + + /** + * Begin graph capture on a stream + */ + bool begin_capture(cudaStream_t stream, const std::string& name); + + /** + * End graph capture and store the graph + */ + bool end_capture(cudaStream_t stream, const std::string& name); + + /** + * Launch a captured graph + */ + bool launch_graph(const std::string& name, cudaStream_t stream); + + /** + * Check if a graph exists + */ + bool has_graph(const std::string& name) const; + + /** + * Delete a graph + */ + void remove_graph(const std::string& name); + + /** + * Clear all graphs + */ + void clear_all(); + + /** + * Get graph statistics + */ + struct GraphStats { + size_t num_graphs; + size_t total_nodes; + }; + GraphStats get_stats() const; + +private: + struct GraphData { + cudaGraph_t graph = nullptr; + cudaGraphExec_t exec = nullptr; + size_t node_count = 0; + }; + + std::unordered_map graphs_; + std::string current_capture_name_; +}; + +/** + * RAII helper for graph capture + */ +class ScopedGraphCapture { +public: + ScopedGraphCapture(GraphManager& manager, cudaStream_t stream, + const std::string& name) + : manager_(manager), stream_(stream), name_(name), active_(false) { + active_ = manager_.begin_capture(stream_, name_); + } + + ~ScopedGraphCapture() { + if (active_) { + manager_.end_capture(stream_, name_); + } + } + + bool is_active() const { return active_; } + +private: + GraphManager& manager_; + cudaStream_t stream_; + std::string name_; + bool active_; +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA +#endif // CUDA_GRAPHS_H diff --git a/src/gpu/cuda/cuda_memory.cu b/src/gpu/cuda/cuda_memory.cu new file mode 100644 index 00000000..613fe09b --- /dev/null +++ b/src/gpu/cuda/cuda_memory.cu @@ -0,0 +1,439 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Advanced Memory Management + + Optimized memory management including: + - Unified memory with hints and prefetching + - Pinned memory for faster transfers + - Double buffering for async operations + - Memory pool management +*/ + +#ifndef CUDA_MEMORY_CU +#define CUDA_MEMORY_CU + +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +// ============================================================================ +// Unified Memory Manager +// ============================================================================ + +class UnifiedMemoryManager { +public: + /** + * Allocate unified memory with optimal hints + */ + static void *allocate_unified(size_t size, int device_id) { + void *ptr = nullptr; + cudaError_t err = cudaMallocManaged(&ptr, size); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate unified memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Set memory access hints for better performance + cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device_id); + cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device_id); + cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); + + return ptr; + } + + /** + * Allocate unified memory with read-mostly hint + * Useful for weight buffers that are rarely modified + */ + static void *allocate_unified_readonly(size_t size, int device_id) { + void *ptr = allocate_unified(size, device_id); + + if (ptr) { + // Mark as read-mostly for better caching + cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, 
device_id); + } + + return ptr; + } + + /** + * Prefetch data to device asynchronously + */ + static void prefetch_to_device(void *ptr, size_t size, int device_id, + cudaStream_t stream = 0) { + cudaMemPrefetchAsync(ptr, size, device_id, stream); + } + + /** + * Prefetch data to CPU asynchronously + */ + static void prefetch_to_host(void *ptr, size_t size, + cudaStream_t stream = 0) { + cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream); + } + + /** + * Free unified memory + */ + static void free_unified(void *ptr) { + if (ptr) { + cudaFree(ptr); + } + } +}; + +// ============================================================================ +// Pinned Memory Manager +// ============================================================================ + +class PinnedMemoryManager { +public: + /** + * Allocate pinned (page-locked) host memory + * Provides faster CPU-GPU transfers + */ + static void *allocate_pinned(size_t size) { + void *ptr = nullptr; + cudaError_t err = cudaMallocHost(&ptr, size); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate pinned memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + return ptr; + } + + /** + * Register existing host memory as pinned + * Useful for making existing allocations DMA-capable + */ + static bool register_pinned(void *ptr, size_t size) { + cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterDefault); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to register pinned memory: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + return true; + } + + /** + * Unregister pinned memory + */ + static void unregister_pinned(void *ptr) { + if (ptr) { + cudaHostUnregister(ptr); + } + } + + /** + * Free pinned memory + */ + static void free_pinned(void *ptr) { + if (ptr) { + cudaFreeHost(ptr); + } + } +}; + +// ============================================================================ +// Double Buffer for Async Operations +// ============================================================================ + +template +class DoubleBuffer { +public: + DoubleBuffer(size_t size, int device_id) + : size_(size), device_id_(device_id), current_buffer_(0), + host_buffers_{nullptr, nullptr}, device_buffers_{nullptr, nullptr}, + compute_stream_(nullptr), copy_stream_(nullptr), valid_(false) { + + // Allocate two pinned host buffers + host_buffers_[0] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[0]) return; + + host_buffers_[1] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[1]) return; + + // Allocate device buffers + if (cudaMalloc(&device_buffers_[0], size * sizeof(T)) != cudaSuccess) return; + if (cudaMalloc(&device_buffers_[1], size * sizeof(T)) != cudaSuccess) return; + + // Create streams for concurrent operations + if (cudaStreamCreate(&compute_stream_) != cudaSuccess) return; + if (cudaStreamCreate(©_stream_) != cudaSuccess) return; + + valid_ = true; + } + + ~DoubleBuffer() { + // Free host buffers (check for nullptr in case construction failed partway) + if (host_buffers_[0]) PinnedMemoryManager::free_pinned(host_buffers_[0]); + if (host_buffers_[1]) PinnedMemoryManager::free_pinned(host_buffers_[1]); + + // Free device buffers + if (device_buffers_[0]) cudaFree(device_buffers_[0]); + if (device_buffers_[1]) cudaFree(device_buffers_[1]); + + // Destroy streams + if (compute_stream_) cudaStreamDestroy(compute_stream_); + if (copy_stream_) 
cudaStreamDestroy(copy_stream_); + } + + /** + * Get current host buffer for writing + */ + T *get_host_buffer() { + return host_buffers_[current_buffer_]; + } + + /** + * Get current device buffer for compute + */ + T *get_device_buffer() { + return device_buffers_[current_buffer_]; + } + + /** + * Swap buffers and initiate async transfer + * While computing on buffer N, prefetch buffer N+1 + */ + void swap_and_transfer() { + int next_buffer = 1 - current_buffer_; + + // Copy next buffer to device asynchronously + cudaMemcpyAsync(device_buffers_[next_buffer], + host_buffers_[next_buffer], + size_ * sizeof(T), + cudaMemcpyHostToDevice, + copy_stream_); + + // Swap for next iteration + current_buffer_ = next_buffer; + } + + /** + * Wait for all operations to complete + */ + void synchronize() { + cudaStreamSynchronize(compute_stream_); + cudaStreamSynchronize(copy_stream_); + } + + cudaStream_t get_compute_stream() { return compute_stream_; } + cudaStream_t get_copy_stream() { return copy_stream_; } + +private: + size_t size_; + int device_id_; + int current_buffer_; + + T *host_buffers_[2]; + T *device_buffers_[2]; + + cudaStream_t compute_stream_; + cudaStream_t copy_stream_; + bool valid_; +}; + +// ============================================================================ +// Memory Pool for Efficient Allocation +// ============================================================================ + +class MemoryPool { +public: + MemoryPool(size_t pool_size, int device_id) + : pool_size_(pool_size), device_id_(device_id), allocated_(0), pool_base_(nullptr) { + + // Allocate large contiguous block + cudaError_t err = cudaMalloc(&pool_base_, pool_size); + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory Pool] Failed to allocate pool: " + << cudaGetErrorString(err) << std::endl; + pool_base_ = nullptr; + } + } + + ~MemoryPool() { + if (pool_base_) { + cudaFree(pool_base_); + } + } + + /** + * Allocate from pool (simple bump allocator) + */ + void *allocate(size_t size, size_t alignment = 256) { + std::lock_guard lock(mutex_); + + if (!pool_base_) return nullptr; + + // Align allocation + size_t aligned_offset = (allocated_ + alignment - 1) & ~(alignment - 1); + + if (aligned_offset + size > pool_size_) { + std::cerr << "[CUDA Memory Pool] Out of pool memory" << std::endl; + return nullptr; + } + + void *ptr = static_cast(pool_base_) + aligned_offset; + allocated_ = aligned_offset + size; + + return ptr; + } + + /** + * Reset pool (invalidates all previous allocations) + */ + void reset() { + std::lock_guard lock(mutex_); + allocated_ = 0; + } + + size_t get_allocated() const { return allocated_; } + size_t get_available() const { return pool_size_ - allocated_; } + +private: + void *pool_base_; + size_t pool_size_; + size_t allocated_; + int device_id_; + std::mutex mutex_; +}; + +// ============================================================================ +// Cache-Aligned Allocator +// ============================================================================ + +/** + * Allocate memory with specific cache line alignment + * Important for avoiding false sharing and optimizing cache usage + * Note: alignment must be a power of 2 + */ +class CacheAlignedAllocator { +public: + /** + * Allocate device memory aligned to cache line (128 bytes default) + * @param size Size to allocate in bytes + * @param alignment Alignment in bytes (must be power of 2, default 128) + * @return Aligned device pointer or nullptr on failure + */ + static void *allocate_aligned(size_t size, size_t alignment 
= 128) { + // Validate alignment is power of 2 + if (alignment == 0 || (alignment & (alignment - 1)) != 0) { + std::cerr << "[CUDA Memory] Alignment must be a power of 2" << std::endl; + return nullptr; + } + + // CUDA allocations are already 256-byte aligned, but we can ensure it + void *ptr = nullptr; + + // Calculate aligned size (alignment must be power of 2) + size_t aligned_size = (size + alignment - 1) & ~(alignment - 1); + + cudaError_t err = cudaMalloc(&ptr, aligned_size); + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate aligned memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + return ptr; + } + + static void free_aligned(void *ptr) { + if (ptr) { + cudaFree(ptr); + } + } +}; + +// ============================================================================ +// Async Memory Operations Helper +// ============================================================================ + +class AsyncMemoryOps { +public: + /** + * Async memcpy with event synchronization + */ + static void copy_async_with_event(void *dst, const void *src, size_t size, + cudaMemcpyKind kind, cudaStream_t stream, + cudaEvent_t *completion_event = nullptr) { + cudaMemcpyAsync(dst, src, size, kind, stream); + + if (completion_event) { + cudaEventRecord(*completion_event, stream); + } + } + + /** + * Async memset + */ + static void memset_async(void *ptr, int value, size_t size, + cudaStream_t stream) { + cudaMemsetAsync(ptr, value, size, stream); + } + + /** + * 2D memcpy for efficient matrix transfers + */ + static void copy_2d_async(void *dst, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + cudaMemcpyKind kind, cudaStream_t stream) { + cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream); + } +}; + +// ============================================================================ +// Memory Statistics +// ============================================================================ + +class MemoryStats { +public: + static void print_memory_info(int device_id) { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + + size_t used_mem = total_mem - free_mem; + + std::cout << "[CUDA Memory Stats] Device " << device_id << std::endl; + std::cout << " Total: " << (total_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Used: " << (used_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Free: " << (free_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Utilization: " << (100.0 * used_mem / total_mem) << "%" << std::endl; + } + + static size_t get_free_memory() { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + return free_mem; + } + + static size_t get_total_memory() { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + return total_mem; + } +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_MEMORY_CU diff --git a/src/gpu/cuda/cuda_memory.h b/src/gpu/cuda/cuda_memory.h new file mode 100644 index 00000000..e340fa70 --- /dev/null +++ b/src/gpu/cuda/cuda_memory.h @@ -0,0 +1,183 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Advanced Memory Management Header + + Interface for optimized memory management utilities. 
+*/ + +#ifndef CUDA_MEMORY_H +#define CUDA_MEMORY_H + +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * Unified Memory Manager + * + * Provides optimized unified memory allocation with hints + */ +class UnifiedMemoryManager { +public: + static void *allocate_unified(size_t size, int device_id); + static void *allocate_unified_readonly(size_t size, int device_id); + static void prefetch_to_device(void *ptr, size_t size, int device_id, + cudaStream_t stream = 0); + static void prefetch_to_host(void *ptr, size_t size, + cudaStream_t stream = 0); + static void free_unified(void *ptr); +}; + +/** + * Pinned Memory Manager + * + * Manages pinned (page-locked) host memory for faster transfers + */ +class PinnedMemoryManager { +public: + static void *allocate_pinned(size_t size); + static void free_pinned(void *ptr); +}; + +/** + * Double Buffer + * + * Implements double buffering for overlapping transfers and computation + */ +template +class DoubleBuffer { +public: + DoubleBuffer(size_t size, int device_id); + ~DoubleBuffer(); + + bool is_valid() const { return valid_; } + T *get_host_buffer(int index) const; + T *get_device_buffer(int index) const; + void swap_buffers(); + void transfer_to_device(int index, cudaStream_t stream); + void transfer_from_device(int index, cudaStream_t stream); + +private: + T *host_buffers_[2]; + T *device_buffers_[2]; + cudaStream_t streams_[2]; + size_t size_; + int current_index_; + bool valid_; +}; + +/** + * Memory Pool + * + * Simple memory pool allocator for reducing allocation overhead + */ +class MemoryPool { +public: + MemoryPool(size_t pool_size, int device_id); + ~MemoryPool(); + + void *allocate(size_t size); + void reset(); + size_t get_allocated() const { return allocated_; } + +private: + void *pool_base_; + size_t pool_size_; + size_t allocated_; + int device_id_; +}; + +/** + * Cache-Aligned Allocator + * + * Allocates memory with specified alignment for optimal cache performance + */ +class CacheAlignedAllocator { +public: + static void *allocate_aligned(size_t size, size_t alignment); + static void free_aligned(void *ptr); +}; + +// ============================================================================ +// Template Implementation for DoubleBuffer +// ============================================================================ + +template +DoubleBuffer::DoubleBuffer(size_t size, int device_id) + : size_(size), current_index_(0), + host_buffers_{nullptr, nullptr}, device_buffers_{nullptr, nullptr}, + streams_{nullptr, nullptr}, valid_(false) { + + // Allocate two pinned host buffers + host_buffers_[0] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[0]) return; + + host_buffers_[1] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[1]) return; + + // Allocate device buffers + if (cudaMalloc(&device_buffers_[0], size * sizeof(T)) != cudaSuccess) return; + if (cudaMalloc(&device_buffers_[1], size * sizeof(T)) != cudaSuccess) return; + + // Create streams for concurrent operations + if (cudaStreamCreate(&streams_[0]) != cudaSuccess) return; + if (cudaStreamCreate(&streams_[1]) != cudaSuccess) return; + + valid_ = true; +} + +template +DoubleBuffer::~DoubleBuffer() { + // Free host buffers (check for nullptr in case construction failed partway) + if (host_buffers_[0]) PinnedMemoryManager::free_pinned(host_buffers_[0]); + if (host_buffers_[1]) PinnedMemoryManager::free_pinned(host_buffers_[1]); 
+ + // Free device buffers + if (device_buffers_[0]) cudaFree(device_buffers_[0]); + if (device_buffers_[1]) cudaFree(device_buffers_[1]); + + // Destroy streams + if (streams_[0]) cudaStreamDestroy(streams_[0]); + if (streams_[1]) cudaStreamDestroy(streams_[1]); +} + +template +T *DoubleBuffer::get_host_buffer(int index) const { + return host_buffers_[index]; +} + +template +T *DoubleBuffer::get_device_buffer(int index) const { + return device_buffers_[index]; +} + +template +void DoubleBuffer::swap_buffers() { + current_index_ = 1 - current_index_; +} + +template +void DoubleBuffer::transfer_to_device(int index, cudaStream_t stream) { + cudaMemcpyAsync(device_buffers_[index], host_buffers_[index], + size_ * sizeof(T), cudaMemcpyHostToDevice, stream); +} + +template +void DoubleBuffer::transfer_from_device(int index, cudaStream_t stream) { + cudaMemcpyAsync(host_buffers_[index], device_buffers_[index], + size_ * sizeof(T), cudaMemcpyDeviceToHost, stream); +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_MEMORY_H diff --git a/src/gpu/cuda/cuda_multi_gpu.cu b/src/gpu/cuda/cuda_multi_gpu.cu new file mode 100644 index 00000000..f6e679b9 --- /dev/null +++ b/src/gpu/cuda/cuda_multi_gpu.cu @@ -0,0 +1,226 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Multi-GPU Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_multi_gpu.h" +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +MultiGPUManager::MultiGPUManager() : initialized_(false), original_device_(0) { + cudaGetDevice(&original_device_); +} + +MultiGPUManager::~MultiGPUManager() { + if (initialized_) { + cudaSetDevice(original_device_); + } +} + +bool MultiGPUManager::initialize(bool use_all) { + if (initialized_) { + return true; + } + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + std::cerr << "[Multi-GPU] No CUDA devices found" << std::endl; + return false; + } + + std::cout << "[Multi-GPU] Found " << device_count << " CUDA device(s)" << std::endl; + + // Collect GPU information + std::vector all_gpus; + for (int i = 0; i < device_count; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + + GPUInfo info; + info.device_id = i; + info.name = prop.name; + info.compute_major = prop.major; + info.compute_minor = prop.minor; + info.total_memory = prop.totalGlobalMem; + info.multiprocessor_count = prop.multiProcessorCount; + info.has_tensor_cores = (prop.major >= 7); + info.has_peer_access = false; + + all_gpus.push_back(info); + + std::cout << "[Multi-GPU] GPU " << i << ": " << info.name + << " (SM " << info.compute_major << "." 
<< info.compute_minor << ")" << std::endl; + } + + if (use_all) { + // Use all GPUs + gpu_info_ = all_gpus; + } else { + // Use only the best GPU + auto best_gpu = std::max_element(all_gpus.begin(), all_gpus.end(), + [](const GPUInfo& a, const GPUInfo& b) { + int score_a = a.compute_major * 100 + a.compute_minor; + int score_b = b.compute_major * 100 + b.compute_minor; + return score_a < score_b; + }); + gpu_info_.push_back(*best_gpu); + } + + initialized_ = true; + std::cout << "[Multi-GPU] Using " << gpu_info_.size() << " GPU(s)" << std::endl; + + return true; +} + +const GPUInfo& MultiGPUManager::get_gpu_info(int gpu_index) const { + return gpu_info_[gpu_index]; +} + +int MultiGPUManager::get_best_gpu() const { + if (gpu_info_.empty()) { + return 0; + } + + int best_idx = 0; + int best_score = gpu_info_[0].compute_major * 100 + gpu_info_[0].compute_minor; + + for (size_t i = 1; i < gpu_info_.size(); i++) { + int score = gpu_info_[i].compute_major * 100 + gpu_info_[i].compute_minor; + if (score > best_score) { + best_score = score; + best_idx = static_cast(i); + } + } + + return best_idx; +} + +bool MultiGPUManager::enable_peer_access() { + if (gpu_info_.size() < 2) { + return true; // Nothing to do with single GPU + } + + std::cout << "[Multi-GPU] Enabling peer-to-peer access..." << std::endl; + + for (size_t i = 0; i < gpu_info_.size(); i++) { + cudaSetDevice(gpu_info_[i].device_id); + + for (size_t j = 0; j < gpu_info_.size(); j++) { + if (i == j) continue; + + int can_access = 0; + cudaDeviceCanAccessPeer(&can_access, gpu_info_[i].device_id, + gpu_info_[j].device_id); + + if (can_access) { + cudaError_t err = cudaDeviceEnablePeerAccess(gpu_info_[j].device_id, 0); + if (err == cudaSuccess) { + gpu_info_[i].has_peer_access = true; + std::cout << "[Multi-GPU] Enabled P2P: GPU " << i << " -> GPU " << j << std::endl; + } else if (err != cudaErrorPeerAccessAlreadyEnabled) { + std::cerr << "[Multi-GPU] Failed to enable P2P: " + << cudaGetErrorString(err) << std::endl; + } else { + // Already enabled, clear the error + cudaGetLastError(); + } + } + } + } + + cudaSetDevice(original_device_); + return true; +} + +std::vector MultiGPUManager::distribute_batch(int total_batch_size) const { + std::vector batch_sizes(gpu_info_.size()); + + if (gpu_info_.size() == 1) { + batch_sizes[0] = total_batch_size; + return batch_sizes; + } + + // Distribute based on relative compute capability + std::vector scores; + int total_score = 0; + + for (const auto& info : gpu_info_) { + int score = info.multiprocessor_count * (info.compute_major * 10 + info.compute_minor); + scores.push_back(score); + total_score += score; + } + + // Distribute proportionally + int remaining = total_batch_size; + for (size_t i = 0; i < gpu_info_.size(); i++) { + if (i == gpu_info_.size() - 1) { + // Last GPU gets all remaining + batch_sizes[i] = remaining; + } else { + int size = (total_batch_size * scores[i]) / total_score; + batch_sizes[i] = size; + remaining -= size; + } + } + + return batch_sizes; +} + +bool MultiGPUManager::set_device(int gpu_index) { + if (gpu_index < 0 || gpu_index >= static_cast(gpu_info_.size())) { + return false; + } + + cudaError_t err = cudaSetDevice(gpu_info_[gpu_index].device_id); + return err == cudaSuccess; +} + +int MultiGPUManager::get_current_device() const { + int device; + cudaGetDevice(&device); + + // Find index in our list + for (size_t i = 0; i < gpu_info_.size(); i++) { + if (gpu_info_[i].device_id == device) { + return static_cast(i); + } + } + + return 0; +} + +void 
MultiGPUManager::synchronize_all() {
+  int current_device;
+  cudaGetDevice(&current_device);
+
+  for (const auto& info : gpu_info_) {
+    cudaSetDevice(info.device_id);
+    cudaDeviceSynchronize();
+  }
+
+  cudaSetDevice(current_device);
+}
+
+ScopedDevice::ScopedDevice(int device_id) : saved_device_(0) {
+  cudaGetDevice(&saved_device_);
+  cudaSetDevice(device_id);
+}
+
+ScopedDevice::~ScopedDevice() {
+  cudaSetDevice(saved_device_);
+}
+
+} // namespace CUDA
+} // namespace GPU
+} // namespace MetalFish
+
+#endif // USE_CUDA
diff --git a/src/gpu/cuda/cuda_multi_gpu.h b/src/gpu/cuda/cuda_multi_gpu.h
new file mode 100644
index 00000000..0b0ffc2a
--- /dev/null
+++ b/src/gpu/cuda/cuda_multi_gpu.h
@@ -0,0 +1,123 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  Multi-GPU Support
+
+  Enables batch distribution across multiple NVIDIA GPUs.
+*/
+
+#ifndef CUDA_MULTI_GPU_H
+#define CUDA_MULTI_GPU_H
+
+#ifdef USE_CUDA
+
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+
+namespace MetalFish {
+namespace GPU {
+namespace CUDA {
+
+/**
+ * GPU Device Information
+ */
+struct GPUInfo {
+  int device_id;
+  std::string name;
+  int compute_major;
+  int compute_minor;
+  size_t total_memory;
+  int multiprocessor_count;
+  bool has_tensor_cores;
+  bool has_peer_access;
+};
+
+/**
+ * Multi-GPU Manager
+ *
+ * Manages multiple GPUs for parallel batch processing.
+ */
+class MultiGPUManager {
+public:
+  MultiGPUManager();
+  ~MultiGPUManager();
+
+  /**
+   * Initialize multi-GPU support
+   * @param use_all If true, use all available GPUs. Otherwise, use best GPU only.
+   * @return true if at least one GPU is available
+   */
+  bool initialize(bool use_all = false);
+
+  /**
+   * Get number of active GPUs
+   */
+  int get_num_gpus() const { return static_cast<int>(gpu_info_.size()); }
+
+  /**
+   * Get GPU information
+   */
+  const GPUInfo& get_gpu_info(int gpu_index) const;
+
+  /**
+   * Get best GPU (highest compute capability)
+   */
+  int get_best_gpu() const;
+
+  /**
+   * Enable peer-to-peer access between GPUs
+   */
+  bool enable_peer_access();
+
+  /**
+   * Distribute batch across GPUs
+   * Returns the batch size for each GPU
+   */
+  std::vector<int> distribute_batch(int total_batch_size) const;
+
+  /**
+   * Set current device
+   */
+  bool set_device(int gpu_index);
+
+  /**
+   * Get current device
+   */
+  int get_current_device() const;
+
+  /**
+   * Synchronize all GPUs
+   */
+  void synchronize_all();
+
+  /**
+   * Check if multi-GPU is enabled
+   */
+  bool is_multi_gpu_enabled() const { return gpu_info_.size() > 1; }
+
+private:
+  std::vector<GPUInfo> gpu_info_;
+  bool initialized_;
+  int original_device_;
+};
+
+/**
+ * RAII helper to switch GPU device temporarily
+ */
+class ScopedDevice {
+public:
+  ScopedDevice(int device_id);
+  ~ScopedDevice();
+
+private:
+  int saved_device_;
+};
+
+} // namespace CUDA
+} // namespace GPU
+} // namespace MetalFish
+
+#endif // USE_CUDA
+#endif // CUDA_MULTI_GPU_H
diff --git a/src/gpu/cuda/cuda_profiling.h b/src/gpu/cuda/cuda_profiling.h
new file mode 100644
index 00000000..93c14824
--- /dev/null
+++ b/src/gpu/cuda/cuda_profiling.h
@@ -0,0 +1,440 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  CUDA Profiling Infrastructure
+
+  Profiling utilities including:
+  - NVTX markers for Nsight profiling
+  - Kernel timing
+  - Occupancy calculator
+  - Performance metrics collection
+*/
+
+#ifndef CUDA_PROFILING_H
+#define CUDA_PROFILING_H
+
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// NVTX profiling support (optional)
+#ifdef USE_NVTX
+#include +#endif + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +// ============================================================================ +// NVTX Markers (for Nsight profiling) +// ============================================================================ + +class NVTXMarker { +public: +#ifdef USE_NVTX + NVTXMarker(const char *name, uint32_t color = 0xFF00FF00) { + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name; + + nvtxRangePushEx(&eventAttrib); + } + + ~NVTXMarker() { + nvtxRangePop(); + } +#else + NVTXMarker(const char *, uint32_t = 0) {} + ~NVTXMarker() {} +#endif +}; + +// Convenience macro +#define NVTX_RANGE(name) NVTXMarker _nvtx_marker(name) +#define NVTX_RANGE_COLOR(name, color) NVTXMarker _nvtx_marker(name, color) + +// ============================================================================ +// Kernel Timer +// ============================================================================ + +class KernelTimer { +public: + KernelTimer(const std::string &name, cudaStream_t stream = 0) + : name_(name), stream_(stream) { + cudaEventCreate(&start_event_); + cudaEventCreate(&stop_event_); + cudaEventRecord(start_event_, stream_); + } + + ~KernelTimer() { + cudaEventRecord(stop_event_, stream_); + cudaEventSynchronize(stop_event_); + + float ms = 0.0f; + cudaEventElapsedTime(&ms, start_event_, stop_event_); + + // Record timing with thread safety + { + std::lock_guard lock(timings_mutex_); + timings_[name_].push_back(ms); + } + + cudaEventDestroy(start_event_); + cudaEventDestroy(stop_event_); + } + + // Get average time for a kernel + static float get_average_time(const std::string &name) { + std::lock_guard lock(timings_mutex_); + auto it = timings_.find(name); + if (it == timings_.end() || it->second.empty()) { + return 0.0f; + } + + float sum = 0.0f; + for (float t : it->second) { + sum += t; + } + return sum / it->second.size(); + } + + // Print all timing statistics + static void print_stats() { + std::cout << "\n[CUDA Kernel Timing Statistics]" << std::endl; + std::cout << "======================================" << std::endl; + + for (const auto &[name, times] : timings_) { + if (times.empty()) continue; + + float sum = 0.0f, min_time = times[0], max_time = times[0]; + for (float t : times) { + sum += t; + min_time = std::min(min_time, t); + max_time = std::max(max_time, t); + } + float avg = sum / times.size(); + + std::cout << name << ":" << std::endl; + std::cout << " Calls: " << times.size() << std::endl; + std::cout << " Average: " << avg << " ms" << std::endl; + std::cout << " Min: " << min_time << " ms" << std::endl; + std::cout << " Max: " << max_time << " ms" << std::endl; + std::cout << " Total: " << sum << " ms" << std::endl; + } + } + + // Reset all timings + static void reset() { + timings_.clear(); + } + +private: + std::string name_; + cudaStream_t stream_; + cudaEvent_t start_event_; + cudaEvent_t stop_event_; + + static std::map> timings_; +}; + +// Convenience macro +#define TIME_KERNEL(name, stream) KernelTimer _kernel_timer(name, stream) + +// ============================================================================ +// Occupancy Calculator +// ============================================================================ + +class OccupancyCalculator { +public: + /** + * Calculate 
theoretical occupancy for a kernel + */ + static float calculate_occupancy(const void *kernel, int block_size, + size_t dynamic_smem_size = 0) { + int min_grid_size, optimal_block_size; + + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + + // Get device properties + cudaDeviceProp prop; + int device; + cudaGetDevice(&device); + cudaGetDeviceProperties(&prop, device); + + // Calculate occupancy + int max_active_blocks; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, kernel, + block_size, dynamic_smem_size); + + float occupancy = (max_active_blocks * block_size / + static_cast(prop.maxThreadsPerMultiProcessor)); + + return occupancy; + } + + /** + * Print occupancy information for a kernel + */ + static void print_occupancy_info(const std::string &name, const void *kernel, + int block_size, size_t dynamic_smem_size = 0) { + float occupancy = calculate_occupancy(kernel, block_size, dynamic_smem_size); + + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, kernel); + + std::cout << "\n[Occupancy Info: " << name << "]" << std::endl; + std::cout << " Block Size: " << block_size << std::endl; + std::cout << " Registers/Thread: " << attr.numRegs << std::endl; + std::cout << " Shared Mem: " << (attr.sharedSizeBytes + dynamic_smem_size) << " bytes" << std::endl; + std::cout << " Occupancy: " << (occupancy * 100.0f) << "%" << std::endl; + + // Suggest optimal block size + int min_grid_size, optimal_block_size; + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + std::cout << " Optimal Block Size: " << optimal_block_size << std::endl; + } + + /** + * Auto-tune block size for best occupancy + */ + static int find_optimal_block_size(const void *kernel, + size_t dynamic_smem_size = 0) { + int min_grid_size, optimal_block_size; + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + return optimal_block_size; + } +}; + +// ============================================================================ +// Performance Metrics Collector +// ============================================================================ + +class PerformanceMetrics { +public: + struct Metrics { + float kernel_time_ms = 0.0f; + float memory_throughput_gbps = 0.0f; + float compute_throughput_gflops = 0.0f; + float occupancy = 0.0f; + size_t memory_transferred = 0; + }; + + /** + * Measure kernel performance + */ + static Metrics measure_kernel(const std::string &name, + std::function kernel_launch, + size_t memory_transferred = 0, + size_t flops = 0) { + Metrics m; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Warm-up + kernel_launch(); + cudaDeviceSynchronize(); + + // Measure + cudaEventRecord(start); + kernel_launch(); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&m.kernel_time_ms, start, stop); + + // Calculate throughput + if (memory_transferred > 0 && m.kernel_time_ms > 0) { + float seconds = m.kernel_time_ms / 1000.0f; + m.memory_throughput_gbps = (memory_transferred / 1e9) / seconds; + } + + if (flops > 0 && m.kernel_time_ms > 0) { + float seconds = m.kernel_time_ms / 1000.0f; + m.compute_throughput_gflops = (flops / 1e9) / seconds; + } + + m.memory_transferred = memory_transferred; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Store metrics + metrics_[name] = m; + + return m; + } + + /** + * Print performance report + */ + static void 
print_report() { + std::cout << "\n[CUDA Performance Report]" << std::endl; + std::cout << "================================================" << std::endl; + + for (const auto &[name, m] : metrics_) { + std::cout << name << ":" << std::endl; + std::cout << " Time: " << m.kernel_time_ms << " ms" << std::endl; + if (m.memory_throughput_gbps > 0) { + std::cout << " Memory Throughput: " << m.memory_throughput_gbps << " GB/s" << std::endl; + } + if (m.compute_throughput_gflops > 0) { + std::cout << " Compute Throughput: " << m.compute_throughput_gflops << " GFLOPS" << std::endl; + } + if (m.occupancy > 0) { + std::cout << " Occupancy: " << (m.occupancy * 100.0f) << "%" << std::endl; + } + std::cout << std::endl; + } + } + + static void reset() { + metrics_.clear(); + } + +private: + static std::map metrics_; +}; + +// ============================================================================ +// CPU Timer (for comparison) +// ============================================================================ + +class CPUTimer { +public: + CPUTimer(const std::string &name) + : name_(name), start_(std::chrono::high_resolution_clock::now()) {} + + ~CPUTimer() { + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start_); + + std::cout << "[CPU Timer] " << name_ << ": " + << (duration.count() / 1000.0) << " ms" << std::endl; + } + +private: + std::string name_; + std::chrono::high_resolution_clock::time_point start_; +}; + +// ============================================================================ +// Bandwidth Tester +// ============================================================================ + +class BandwidthTester { +public: + /** + * Measure host to device bandwidth + */ + static float measure_h2d_bandwidth(size_t size) { + void *h_data, *d_data; + cudaMallocHost(&h_data, size); + cudaMalloc(&d_data, size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms; + cudaEventElapsedTime(&ms, start, stop); + + float bandwidth_gbps = (size / 1e9) / (ms / 1000.0f); + + cudaFreeHost(h_data); + cudaFree(d_data); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return bandwidth_gbps; + } + + /** + * Measure device to host bandwidth + */ + static float measure_d2h_bandwidth(size_t size) { + void *h_data, *d_data; + cudaMallocHost(&h_data, size); + cudaMalloc(&d_data, size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms; + cudaEventElapsedTime(&ms, start, stop); + + float bandwidth_gbps = (size / 1e9) / (ms / 1000.0f); + + cudaFreeHost(h_data); + cudaFree(d_data); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return bandwidth_gbps; + } + + /** + * Print bandwidth test results + */ + static void print_bandwidth_tests() { + std::cout << "\n[CUDA Bandwidth Tests]" << std::endl; + std::cout << "================================" << std::endl; + + std::vector sizes = { + 1 * 1024 * 1024, // 1 MB + 16 * 1024 * 1024, // 16 MB + 64 * 1024 * 1024, // 64 MB + 256 * 1024 * 1024 // 256 MB + }; + + for (size_t size : sizes) { + float h2d = measure_h2d_bandwidth(size); + float d2h = measure_d2h_bandwidth(size); + + std::cout << "Size: " << (size / (1024 * 1024)) << " MB" << 
std::endl; + std::cout << " H2D: " << h2d << " GB/s" << std::endl; + std::cout << " D2H: " << d2h << " GB/s" << std::endl; + } + } +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +// Initialize static members +namespace MetalFish { +namespace GPU { +namespace CUDA { +std::map> KernelTimer::timings_; +std::mutex KernelTimer::timings_mutex_; +std::map PerformanceMetrics::metrics_; +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_PROFILING_H diff --git a/src/gpu/cuda/kernels/nnue_persistent.cu b/src/gpu/cuda/kernels/nnue_persistent.cu new file mode 100644 index 00000000..a0592568 --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_persistent.cu @@ -0,0 +1,203 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Persistent Kernels for Small Batches + + Implements persistent kernels that stay resident on the GPU, + reducing launch overhead for small batch evaluations. +*/ + +#ifndef NNUE_PERSISTENT_KERNELS_CU +#define NNUE_PERSISTENT_KERNELS_CU + +#ifdef USE_CUDA + +#include +#include +#include + +namespace cg = cooperative_groups; + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +constexpr int FC0_OUT = 15; +constexpr int FC1_OUT = 32; +constexpr int WEIGHT_SCALE_BITS = 6; +constexpr int OUTPUT_SCALE = 16; + +// ============================================================================ +// Work Queue for Persistent Kernels +// ============================================================================ + +/** + * Work item for NNUE evaluation + */ +struct NNUEWorkItem { + const accumulator_t *accumulators; + int32_t *output; + int hidden_dim; + bool valid; +}; + +/** + * Persistent kernel for small batch NNUE evaluation + * Stays resident and processes work items as they arrive + */ +__global__ void persistent_nnue_evaluator( + const layer_weight_t *fc0_weights, + const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, + const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, + const int32_t *fc2_biases, + NNUEWorkItem *work_queue, + volatile int *queue_head, + volatile int *queue_tail, + int max_queue_size, + volatile bool *shutdown_flag) { + + __shared__ int8_t fc0_sqr[2 * 16]; + __shared__ int8_t fc0_linear[2]; + __shared__ int8_t fc1_out[32]; + + auto grid = cg::this_grid(); + int work_idx = blockIdx.x; + + while (true) { + // Check for shutdown + if (*shutdown_flag) { + break; + } + + // Try to get work + if (*queue_tail <= *queue_head) { + // No work available, wait briefly + // Use __nanosleep on SM 7.0+, busy-wait on older GPUs +#if __CUDA_ARCH__ >= 700 + __nanosleep(1000); // Sleep 1 microsecond +#else + // Busy-wait for compatibility with older GPUs + for (int i = 0; i < 100; i++) { + __threadfence(); + } +#endif + continue; + } + + // Get work item atomically + int item_idx = atomicAdd(const_cast(queue_head), 1); + if (item_idx >= *queue_tail) { + // Missed it, try again + continue; + } + + item_idx = item_idx % max_queue_size; + NNUEWorkItem work = work_queue[item_idx]; + + if (!work.valid) { + continue; + } + + // Process the work item + const accumulator_t *white_acc = work.accumulators; + const accumulator_t *black_acc = white_acc + work.hidden_dim; + + // FC0 layer - simplified version for persistent kernel + int tid = threadIdx.x; + + // Process each perspective + for (int p = 0; p < 2; p++) { + const accumulator_t *acc = (p == 0) ? 
white_acc : black_acc;
+
+      for (int out = tid; out <= FC0_OUT; out += blockDim.x) {
+        int32_t sum = fc0_biases[out];
+
+        for (int i = 0; i < work.hidden_dim; i++) {
+          int16_t val = static_cast<int16_t>(acc[i] >> WEIGHT_SCALE_BITS);
+          int8_t clipped = static_cast<int8_t>(max(0, min(127, static_cast<int>(val))));
+          sum += clipped * fc0_weights[i * (FC0_OUT + 1) + out];
+        }
+
+        int16_t result = static_cast<int16_t>(sum >> WEIGHT_SCALE_BITS);
+        if (out < FC0_OUT) {
+          int clamped = max(0, min(127, static_cast<int>(result)));
+          fc0_sqr[p * FC0_OUT + out] = static_cast<int8_t>((clamped * clamped) >> 7);
+        } else {
+          fc0_linear[p] = static_cast<int8_t>(max(0, min(127, static_cast<int>(result))));
+        }
+      }
+    }
+    __syncthreads();
+
+    // FC1 layer
+    if (tid < FC1_OUT) {
+      int32_t sum = fc1_biases[tid];
+      for (int i = 0; i < 2 * FC0_OUT; i++) {
+        sum += fc0_sqr[i] * fc1_weights[i * FC1_OUT + tid];
+      }
+      fc1_out[tid] = static_cast<int8_t>(
+          max(0, min(127, static_cast<int>(sum >> WEIGHT_SCALE_BITS))));
+    }
+    __syncthreads();
+
+    // FC2 layer with skip connection
+    if (tid == 0) {
+      int32_t sum = fc2_biases[0];
+      for (int i = 0; i < FC1_OUT; i++) {
+        sum += fc1_out[i] * fc2_weights[i];
+      }
+
+      int32_t skip_val = ((fc0_linear[0] + fc0_linear[1]) * 600 * OUTPUT_SCALE) /
+                         (2 * 127 * (1 << WEIGHT_SCALE_BITS));
+      *work.output = sum + skip_val;
+    }
+
+    // Mark work as complete
+    work_queue[item_idx].valid = false;
+    __syncthreads();
+  }
+}
+
+// ============================================================================
+// Host Interface
+// ============================================================================
+
+extern "C" {
+
+/**
+ * Launch persistent kernel
+ * This kernel stays resident and processes work from a queue
+ */
+void cuda_launch_persistent_evaluator(
+    const layer_weight_t *fc0_weights,
+    const int32_t *fc0_biases,
+    const layer_weight_t *fc1_weights,
+    const int32_t *fc1_biases,
+    const layer_weight_t *fc2_weights,
+    const int32_t *fc2_biases,
+    NNUEWorkItem *work_queue,
+    volatile int *queue_head,
+    volatile int *queue_tail,
+    int max_queue_size,
+    volatile bool *shutdown_flag,
+    cudaStream_t stream) {
+
+  // Launch with moderate block size
+  dim3 block(128);
+  dim3 grid(4); // 4 blocks for better latency hiding
+
+  persistent_nnue_evaluator<<<grid, block, 0, stream>>>(
+      fc0_weights, fc0_biases,
+      fc1_weights, fc1_biases,
+      fc2_weights, fc2_biases,
+      work_queue, queue_head, queue_tail,
+      max_queue_size, shutdown_flag);
+}
+
+} // extern "C"
+
+#endif // USE_CUDA
+#endif // NNUE_PERSISTENT_KERNELS_CU
diff --git a/src/gpu/cuda/kernels/nnue_persistent.h b/src/gpu/cuda/kernels/nnue_persistent.h
new file mode 100644
index 00000000..1e00acf6
--- /dev/null
+++ b/src/gpu/cuda/kernels/nnue_persistent.h
@@ -0,0 +1,51 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  Persistent Kernels Header
+*/
+
+#ifndef NNUE_PERSISTENT_KERNELS_H
+#define NNUE_PERSISTENT_KERNELS_H
+
+#ifdef USE_CUDA
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+using layer_weight_t = int8_t;
+using accumulator_t = int32_t;
+
+/**
+ * Work item for NNUE evaluation
+ */
+struct NNUEWorkItem {
+  const accumulator_t *accumulators;
+  int32_t *output;
+  int hidden_dim;
+  bool valid;
+};
+
+extern "C" {
+
+/**
+ * Launch persistent kernel for small batch processing
+ */
+void cuda_launch_persistent_evaluator(
+    const layer_weight_t *fc0_weights,
+    const int32_t *fc0_biases,
+    const layer_weight_t *fc1_weights,
+    const int32_t *fc1_biases,
+    const layer_weight_t *fc2_weights,
+    const int32_t *fc2_biases,
+    NNUEWorkItem *work_queue,
+    volatile int *queue_head,
+    volatile int *queue_tail,
+    int 
max_queue_size, + volatile bool *shutdown_flag, + cudaStream_t stream); + +} // extern "C" + +#endif // USE_CUDA +#endif // NNUE_PERSISTENT_KERNELS_H diff --git a/src/gpu/cuda/kernels/nnue_simd.cu b/src/gpu/cuda/kernels/nnue_simd.cu new file mode 100644 index 00000000..f03f635e --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_simd.cu @@ -0,0 +1,505 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE SIMD Kernels - Warp-Optimized + + Advanced CUDA kernels using warp-level primitives for maximum performance. + Optimized for Volta and later architectures with independent thread scheduling. +*/ + +#ifndef NNUE_CUDA_SIMD_CU +#define NNUE_CUDA_SIMD_CU + +#include +#include +#include + +// Cooperative groups for flexible thread synchronization +#include +namespace cg = cooperative_groups; + +// ============================================================================ +// Architecture Constants +// ============================================================================ + +constexpr int FT_DIM_BIG = 1024; +constexpr int FT_DIM_SMALL = 128; +constexpr int FC0_OUT = 15; +constexpr int FC1_OUT = 32; +constexpr int WEIGHT_SCALE_BITS = 6; +constexpr int OUTPUT_SCALE = 16; +constexpr int HALFKA_DIMS = 45056; + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +// ============================================================================ +// Warp-Level Reduction Primitives +// ============================================================================ + +/** + * Warp-level sum reduction using shuffle operations + * Much faster than shared memory reduction + */ +template +__device__ __forceinline__ T warp_reduce_sum(T val) { +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +/** + * Block-level sum reduction combining warp reductions + */ +template +__device__ __forceinline__ T block_reduce_sum(T val) { + static __shared__ T shared[32]; // One element per warp + + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + // Reduce within warp + val = warp_reduce_sum(val); + + // Write reduced value to shared memory + if (lane == 0) { + shared[wid] = val; + } + __syncthreads(); + + // First warp reduces across warps + if (wid == 0) { + val = (lane < blockDim.x / 32) ? 
shared[lane] : 0; + val = warp_reduce_sum(val); + } + + return val; +} + +/** + * Warp-level max reduction using shuffle operations + */ +template +__device__ __forceinline__ T warp_reduce_max(T val) { +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val = max(val, __shfl_down_sync(0xffffffff, val, offset)); + } + return val; +} + +// ============================================================================ +// Activation Functions +// ============================================================================ + +__device__ __forceinline__ int8_t clipped_relu(int16_t x) { + return static_cast(max(0, min(127, static_cast(x)))); +} + +__device__ __forceinline__ int8_t sqr_clipped_relu(int16_t x) { + int clamped = max(0, min(127, static_cast(x))); + return static_cast((clamped * clamped) >> 7); +} + +// ============================================================================ +// Feature Extraction with Ballot Sync +// ============================================================================ + +/** + * Extract HalfKA features using warp ballot for efficient bitboard processing + * Uses __ballot_sync to find active lanes with pieces + */ +__global__ void extract_halfka_features_simd( + const uint64_t *__restrict__ piece_bitboards, + const uint8_t *__restrict__ king_squares, + int32_t *__restrict__ white_features, + int32_t *__restrict__ black_features, + uint32_t *__restrict__ feature_counts, + int batch_size, int max_features) { + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + int lane = threadIdx.x % 32; + int warp_id = threadIdx.x / 32; + + __shared__ int white_count_shared; + __shared__ int black_count_shared; + + if (threadIdx.x == 0) { + white_count_shared = 0; + black_count_shared = 0; + } + __syncthreads(); + + int white_ksq = king_squares[pos_idx * 2]; + int black_ksq = king_squares[pos_idx * 2 + 1]; + + // Each warp processes a subset of piece types + int color = warp_id / 3; + int pt = (warp_id % 3) * 2 + 1; + + if (color < 2 && pt <= 6) { + uint64_t bb = piece_bitboards[pos_idx * 14 + color * 7 + pt]; + + // Each lane processes potential squares + int sq_base = lane * 2; + for (int sq_off = 0; sq_off < 2; sq_off++) { + int sq = sq_base + sq_off; + if (sq < 64 && (bb & (1ULL << sq))) { + // White perspective + int oriented_ksq_w = white_ksq ^ ((white_ksq & 4) ? 7 : 0); + int oriented_sq_w = sq ^ ((white_ksq & 4) ? 7 : 0); + int piece_idx_w = (pt - 1) + (color != 0 ? 6 : 0); + int white_feat = oriented_ksq_w * 640 + piece_idx_w * 64 + oriented_sq_w; + + if (white_feat >= 0 && white_feat < HALFKA_DIMS) { + int idx = atomicAdd(&white_count_shared, 1); + if (idx < max_features) { + white_features[pos_idx * max_features + idx] = white_feat; + } + } + + // Black perspective + int black_ksq_mir = black_ksq ^ 56; + int oriented_ksq_b = black_ksq_mir ^ ((black_ksq_mir & 4) ? 7 : 0); + int sq_mir = sq ^ 56; + int oriented_sq_b = sq_mir ^ ((black_ksq_mir & 4) ? 7 : 0); + int piece_idx_b = (pt - 1) + ((color ^ 1) != 0 ? 
6 : 0); + int black_feat = oriented_ksq_b * 640 + piece_idx_b * 64 + oriented_sq_b; + + if (black_feat >= 0 && black_feat < HALFKA_DIMS) { + int idx = atomicAdd(&black_count_shared, 1); + if (idx < max_features) { + black_features[pos_idx * max_features + idx] = black_feat; + } + } + } + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + feature_counts[pos_idx * 2] = white_count_shared; + feature_counts[pos_idx * 2 + 1] = black_count_shared; + } +} + +// ============================================================================ +// Feature Transform with Warp Shuffle +// ============================================================================ + +/** + * Feature transform using advanced warp shuffle for feature broadcast + * Achieves better memory coalescing than standard approach + */ +__global__ void feature_transform_simd( + const weight_t *__restrict__ weights, + const weight_t *__restrict__ biases, + const int32_t *__restrict__ features, + const uint32_t *__restrict__ feature_counts, + accumulator_t *__restrict__ accumulators, + int hidden_dim, int batch_size, int max_features_per_pos) { + + int pos_idx = blockIdx.y; + if (pos_idx >= batch_size) return; + + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition<32>(block); + + int warp_id = threadIdx.x / 32; + int lane = threadIdx.x % 32; + + // Each warp processes 32 hidden dimensions + int hidden_base = (blockIdx.x * (blockDim.x / 32) + warp_id) * 32; + int hidden_idx = hidden_base + lane; + + if (hidden_idx >= hidden_dim) return; + + // Start with bias + accumulator_t acc = static_cast(biases[hidden_idx]); + + // Feature counts are stored as [white, black] for each position + // For now, we process white features (index 0). This should be extended + // to handle both perspectives or the caller should specify which perspective. + int count = feature_counts[pos_idx * 2]; // Use white features + const int32_t *pos_features = features + pos_idx * max_features_per_pos; + + // Process features with warp-level cooperation + // Note: Broadcasting one feature at a time provides good coalesced access + // to weights. Alternative approaches (shared memory or processing multiple + // features) trade off register pressure and may not improve performance. + // This simple approach keeps registers low and allows high occupancy. + for (int i = 0; i < count; i++) { + // Lane 0 reads the feature index + int32_t feat_idx = (lane == 0) ? 
pos_features[i] : 0; + + // Broadcast to all lanes in warp using shuffle + feat_idx = warp.shfl(feat_idx, 0); + + if (feat_idx >= 0 && feat_idx < HALFKA_DIMS) { + // All lanes read coalesced weight access + // Each thread reads weights[feat_idx * hidden_dim + hidden_idx] + // where hidden_idx is unique per thread (hidden_base + lane) + // This ensures perfect coalescing across the warp + acc += weights[feat_idx * hidden_dim + hidden_idx]; + } + } + + accumulators[pos_idx * hidden_dim + hidden_idx] = acc; +} + +// ============================================================================ +// FC Layer with Warp Reduction +// ============================================================================ + +/** + * Fully connected layer using warp-level sum reduction + * Much faster than atomic operations or shared memory + */ +__global__ void fc_layer_simd( + const int8_t *__restrict__ input, + const layer_weight_t *__restrict__ weights, + const int32_t *__restrict__ biases, + int8_t *__restrict__ output, + int input_size, int output_size, int batch_size) { + + int pos_idx = blockIdx.x; + int out_idx = blockIdx.y; + + if (pos_idx >= batch_size || out_idx >= output_size) return; + + const int8_t *in_ptr = input + pos_idx * input_size; + const layer_weight_t *w_ptr = weights + out_idx * input_size; + + // Each thread processes a subset of inputs + int32_t partial_sum = 0; + for (int i = threadIdx.x; i < input_size; i += blockDim.x) { + partial_sum += static_cast(in_ptr[i]) * w_ptr[i]; + } + + // Warp-level reduction + partial_sum = warp_reduce_sum(partial_sum); + + // First thread in each warp writes to shared memory + __shared__ int32_t warp_sums[32]; + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + if (lane == 0) { + warp_sums[wid] = partial_sum; + } + __syncthreads(); + + // Final reduction by first warp + if (wid == 0) { + partial_sum = (lane < blockDim.x / 32) ? 
warp_sums[lane] : 0; + partial_sum = warp_reduce_sum(partial_sum); + + if (lane == 0) { + partial_sum += biases[out_idx]; + output[pos_idx * output_size + out_idx] = + clipped_relu(static_cast(partial_sum >> WEIGHT_SCALE_BITS)); + } + } +} + +// ============================================================================ +// Batch Evaluation with Cooperative Groups +// ============================================================================ + +/** + * Complete NNUE evaluation using cooperative groups + * Enables better thread cooperation and grid-wide synchronization + */ +__global__ void batch_evaluate_simd( + const accumulator_t *__restrict__ accumulators, + const layer_weight_t *__restrict__ fc0_weights, + const int32_t *__restrict__ fc0_biases, + const layer_weight_t *__restrict__ fc1_weights, + const int32_t *__restrict__ fc1_biases, + const layer_weight_t *__restrict__ fc2_weights, + const int32_t *__restrict__ fc2_biases, + int32_t *__restrict__ output, + int hidden_dim, int batch_size) { + + auto grid = cg::this_grid(); + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition<32>(block); + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + __shared__ int8_t fc0_sqr[2 * 16]; + __shared__ int8_t fc0_linear[2]; + __shared__ int8_t fc1_out[32]; + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + // FC0 layer - process both perspectives in parallel with warp-level cooperation + for (int p = 0; p < 2; p++) { + const accumulator_t *acc = (p == 0) ? white_acc : black_acc; + + // Each warp cooperatively computes all FC0 outputs + for (int out = 0; out <= FC0_OUT; ++out) { + // Lane 0 starts from bias; other lanes start from 0 to avoid double-counting + int32_t sum = (lane == 0) ? 
fc0_biases[out] : 0; + + // Warp-level reduction over hidden dims: strided accumulation per lane + for (int i = lane; i < hidden_dim; i += 32) { + int8_t clipped = clipped_relu( + static_cast(acc[i] >> WEIGHT_SCALE_BITS)); + sum += clipped * fc0_weights[i * (FC0_OUT + 1) + out]; + } + + // Reduce partial sums across the warp + sum = warp_reduce_sum(sum); + + if (lane == 0) { + int16_t result = static_cast(sum >> WEIGHT_SCALE_BITS); + if (out < FC0_OUT) { + fc0_sqr[p * FC0_OUT + out] = sqr_clipped_relu(result); + } else { + fc0_linear[p] = clipped_relu(result); + } + } + } + } + block.sync(); + + // FC1 layer + if (lane < FC1_OUT) { + int32_t sum = fc1_biases[lane]; + for (int i = 0; i < 2 * FC0_OUT; i++) { + sum += fc0_sqr[i] * fc1_weights[i * FC1_OUT + lane]; + } + fc1_out[lane] = clipped_relu(static_cast(sum >> WEIGHT_SCALE_BITS)); + } + block.sync(); + + // FC2 layer with skip connection + if (threadIdx.x == 0) { + int32_t sum = fc2_biases[0]; + for (int i = 0; i < FC1_OUT; i++) { + sum += fc1_out[i] * fc2_weights[i]; + } + + // Add skip connection + int32_t skip_val = ((fc0_linear[0] + fc0_linear[1]) * 600 * OUTPUT_SCALE) / + (2 * 127 * (1 << WEIGHT_SCALE_BITS)); + output[pos_idx] = sum + skip_val; + } +} + +// ============================================================================ +// PSQT Accumulation with Warp Reduction +// ============================================================================ + +/** + * PSQT (Piece-Square Table) accumulation using warp primitives + */ +__global__ void psqt_accumulate_simd( + const int32_t *__restrict__ features, + const uint32_t *__restrict__ feature_counts, + const int32_t *__restrict__ psqt_weights, + int32_t *__restrict__ psqt_values, + int batch_size, int max_features, int num_buckets) { + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + auto warp = cg::tiled_partition<32>(cg::this_thread_block()); + int lane = warp.thread_rank(); + + int count = feature_counts[pos_idx]; + const int32_t *pos_features = features + pos_idx * max_features; + + // Each thread accumulates a subset of features + int32_t partial_sum = 0; + for (int i = lane; i < count; i += 32) { + int feat_idx = pos_features[i]; + if (feat_idx >= 0) { + partial_sum += psqt_weights[feat_idx]; + } + } + + // Warp-level sum reduction + partial_sum = warp_reduce_sum(partial_sum); + + // Lane 0 writes the result + if (lane == 0) { + psqt_values[pos_idx] = partial_sum; + } +} + +// ============================================================================ +// Host Interface Functions +// ============================================================================ + +extern "C" { + +void cuda_feature_transform_simd( + const weight_t *weights, const weight_t *biases, + const int32_t *features, const uint32_t *feature_counts, + accumulator_t *accumulators, int hidden_dim, int batch_size, + int max_features_per_pos, cudaStream_t stream) { + + dim3 block(256); // 8 warps per block + dim3 grid((hidden_dim + 255) / 256, batch_size); + + feature_transform_simd<<>>( + weights, biases, features, feature_counts, accumulators, + hidden_dim, batch_size, max_features_per_pos); +} + +void cuda_fc_layer_simd( + const int8_t *input, const layer_weight_t *weights, + const int32_t *biases, int8_t *output, + int input_size, int output_size, int batch_size, cudaStream_t stream) { + + dim3 block(128); // 4 warps per block + dim3 grid(batch_size, output_size); + + fc_layer_simd<<>>( + input, weights, biases, output, input_size, output_size, batch_size); +} + +void 
cuda_batch_evaluate_simd( + const accumulator_t *accumulators, + const layer_weight_t *fc0_weights, const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, const int32_t *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream) { + + dim3 block(128); + dim3 grid(batch_size); + + batch_evaluate_simd<<>>( + accumulators, fc0_weights, fc0_biases, fc1_weights, fc1_biases, + fc2_weights, fc2_biases, output, hidden_dim, batch_size); +} + +void cuda_psqt_accumulate_simd( + const int32_t *features, const uint32_t *feature_counts, + const int32_t *psqt_weights, int32_t *psqt_values, + int batch_size, int max_features, int num_buckets, cudaStream_t stream) { + + dim3 block(32); // Single warp + dim3 grid(batch_size); + + psqt_accumulate_simd<<>>( + features, feature_counts, psqt_weights, psqt_values, + batch_size, max_features, num_buckets); +} + +} // extern "C" + +#endif // NNUE_CUDA_SIMD_CU diff --git a/src/gpu/cuda/kernels/nnue_simd.h b/src/gpu/cuda/kernels/nnue_simd.h new file mode 100644 index 00000000..11ecce4f --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_simd.h @@ -0,0 +1,55 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE SIMD Kernels Header + + Interface for warp-optimized CUDA kernels. +*/ + +#ifndef NNUE_CUDA_SIMD_H +#define NNUE_CUDA_SIMD_H + +#include +#include + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// Feature transform with warp shuffle optimization +void cuda_feature_transform_simd( + const weight_t *weights, const weight_t *biases, + const int32_t *features, const uint32_t *feature_counts, + accumulator_t *accumulators, int hidden_dim, int batch_size, + int max_features_per_pos, cudaStream_t stream); + +// FC layer with warp reduction +void cuda_fc_layer_simd( + const int8_t *input, const layer_weight_t *weights, + const int32_t *biases, int8_t *output, + int input_size, int output_size, int batch_size, cudaStream_t stream); + +// Batch evaluation with cooperative groups +void cuda_batch_evaluate_simd( + const accumulator_t *accumulators, + const layer_weight_t *fc0_weights, const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, const int32_t *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream); + +// PSQT accumulation with warp reduction +void cuda_psqt_accumulate_simd( + const int32_t *features, const uint32_t *feature_counts, + const int32_t *psqt_weights, int32_t *psqt_values, + int batch_size, int max_features, int num_buckets, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif // NNUE_CUDA_SIMD_H diff --git a/src/gpu/cuda/kernels/nnue_tensor_core.cu b/src/gpu/cuda/kernels/nnue_tensor_core.cu new file mode 100644 index 00000000..cb91c6fa --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_tensor_core.cu @@ -0,0 +1,441 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE Tensor Core Kernels + + Tensor core accelerated kernels using WMMA API for maximum performance + on Volta (SM 7.0) and later architectures. 
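+
+  For reference, the kernels below are built around the standard 16x16x16
+  WMMA tile pattern sketched here (illustrative only; A_ptr, B_ptr, C_ptr
+  and the leading dimensions lda/ldb/ldc are placeholders, not part of the
+  engine API):
+
+    fragment<matrix_a, 16, 16, 16, half, row_major> a;
+    fragment<matrix_b, 16, 16, 16, half, col_major> b;
+    fragment<accumulator, 16, 16, 16, half> c;
+    fill_fragment(c, __float2half(0.0f));
+    load_matrix_sync(a, A_ptr, lda);
+    load_matrix_sync(b, B_ptr, ldb);
+    mma_sync(c, a, b, c);
+    store_matrix_sync(C_ptr, c, ldc, mem_row_major);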
+*/
+
+#ifndef NNUE_CUDA_TENSOR_CORE_CU
+#define NNUE_CUDA_TENSOR_CORE_CU
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <iostream>
+
+// Include WMMA (Warp Matrix Multiply-Accumulate) API
+#if __CUDA_ARCH__ >= 700
+#include <mma.h>
+using namespace nvcuda::wmma;
+#endif
+
+// ============================================================================
+// Architecture Constants
+// ============================================================================
+
+constexpr int FT_DIM_BIG = 1024;
+constexpr int FT_DIM_SMALL = 128;
+constexpr int FC0_OUT = 15;
+constexpr int FC1_OUT = 32;
+constexpr int WEIGHT_SCALE_BITS = 6;
+constexpr int OUTPUT_SCALE = 16;
+
+// WMMA tile sizes (16x16x16 for FP16)
+constexpr int WMMA_M = 16;
+constexpr int WMMA_N = 16;
+constexpr int WMMA_K = 16;
+
+using layer_weight_t = int8_t;
+using accumulator_t = int32_t;
+
+// ============================================================================
+// Activation Functions
+// ============================================================================
+
+__device__ __forceinline__ int8_t clipped_relu(int16_t x) {
+  return static_cast<int8_t>(max(0, min(127, static_cast<int>(x))));
+}
+
+__device__ __forceinline__ int8_t sqr_clipped_relu(int16_t x) {
+  int clamped = max(0, min(127, static_cast<int>(x)));
+  return static_cast<int8_t>((clamped * clamped) >> 7);
+}
+
+// ============================================================================
+// FP16 Conversion Helpers
+// ============================================================================
+
+/**
+ * Convert int8 activation to half precision
+ */
+__device__ __forceinline__ half int8_to_half(int8_t x) {
+  return __int2half_rn(static_cast<int>(x));
+}
+
+/**
+ * Convert half precision back to int8 with clipping
+ */
+__device__ __forceinline__ int8_t half_to_int8_clipped(half x) {
+  int val = __half2int_rn(x);
+  return static_cast<int8_t>(max(0, min(127, val)));
+}
+
+// ============================================================================
+// Tensor Core FC Layer (FP16)
+// ============================================================================
+
+#if __CUDA_ARCH__ >= 700
+
+/**
+ * Fully connected layer using tensor cores (WMMA API)
+ * Input: [batch_size, input_size] in FP16
+ * Weights: [output_size, input_size] in FP16
+ * Output: [batch_size, output_size] in FP16
+ *
+ * Uses 16x16x16 tiles for optimal tensor core utilization
+ */
+__global__ void fc_layer_tensor_core_fp16(
+    const half *__restrict__ input,
+    const half *__restrict__ weights,
+    const half *__restrict__ biases,
+    half *__restrict__ output,
+    int batch_size, int input_size, int output_size) {
+
+  // Warp and lane IDs
+  int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  int warpN = blockIdx.y;
+
+  // Declare the fragments
+  fragment<matrix_a, WMMA_M, WMMA_N, WMMA_K, half, row_major> a_frag;
+  fragment<matrix_b, WMMA_M, WMMA_N, WMMA_K, half, col_major> b_frag;
+  fragment<accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
+
+  // Initialize the output to zero
+  fill_fragment(c_frag, __float2half(0.0f));
+
+  // Bounds check
+  if (warpM * WMMA_M >= batch_size || warpN * WMMA_N >= output_size) {
+    return;
+  }
+
+  // Matrix multiply: C = A * B^T
+  // A: [batch_size, input_size]
+  // B: [output_size, input_size] (transposed to col_major)
+  for (int k = 0; k < input_size; k += WMMA_K) {
+    int aRow = warpM * WMMA_M;
+    int aCol = k;
+    int bRow = k;
+    int bCol = warpN * WMMA_N;
+
+    // Load A fragment (input activations)
+    if (aRow < batch_size && aCol < input_size) {
+      load_matrix_sync(a_frag, input + aRow * input_size + aCol, input_size);
+    }
+
+    // Load B fragment (weights, transposed)
+    if (bCol < output_size && bRow < input_size) {
+      load_matrix_sync(b_frag, weights + bCol * 
input_size + bRow, input_size); + } + + // Perform the matrix multiply-accumulate + mma_sync(c_frag, a_frag, b_frag, c_frag); + } + + // Store the output first (WMMA handles the fragment layout automatically) + int cRow = warpM * WMMA_M; + int cCol = warpN * WMMA_N; + if (cRow < batch_size && cCol < output_size) { + store_matrix_sync(output + cRow * output_size + cCol, c_frag, + output_size, mem_row_major); + } + + // Add biases in global memory to avoid fragment layout assumptions + // Only one thread per warp does this to avoid races + if (biases != nullptr && threadIdx.x % 32 == 0) { + for (int row = 0; row < WMMA_M && (cRow + row) < batch_size; ++row) { + for (int col = 0; col < WMMA_N && (cCol + col) < output_size; ++col) { + int global_col = cCol + col; + int out_index = (cRow + row) * output_size + global_col; + output[out_index] = __hadd(output[out_index], biases[global_col]); + } + } + } +} + +/** + * FC0 layer using tensor cores + * Converts int32 accumulators to FP16, applies tensor cores, converts back + */ +__global__ void fc0_layer_tensor_core( + const accumulator_t *__restrict__ accumulators, + const half *__restrict__ weights_fp16, + const half *__restrict__ biases_fp16, + int8_t *__restrict__ output_sqr, + int8_t *__restrict__ output_linear, + int hidden_dim, int batch_size) { + + extern __shared__ half shared_mem[]; + half *input_fp16 = shared_mem; + half *output_fp16 = shared_mem + blockDim.x * hidden_dim; + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + // Convert both perspectives to FP16 + for (int i = threadIdx.x; i < 2 * hidden_dim; i += blockDim.x) { + const accumulator_t *acc = (i < hidden_dim) ? white_acc : black_acc; + int idx = (i < hidden_dim) ? 
i : i - hidden_dim; + + // Apply clipped ReLU and convert to FP16 + int16_t val = static_cast(acc[idx] >> WEIGHT_SCALE_BITS); + int8_t clipped = clipped_relu(val); + input_fp16[i] = __int2half_rn(clipped); + } + __syncthreads(); + + // Compute dot product between input_fp16 (length 2 * hidden_dim) and + // weights_fp16 row for this output, using warp-level primitives + // Note: This version avoids WMMA misuse and uses simple FP16 operations + int warp_id = threadIdx.x / 32; + int lane = threadIdx.x % 32; + + if (warp_id < (FC0_OUT + 1)) { + int out_idx = warp_id; + + // Each thread in the warp accumulates over a strided subset of features + half local_sum = __float2half(0.0f); + for (int k = lane; k < 2 * hidden_dim; k += warpSize) { + half in_val = input_fp16[k]; + half w_val = weights_fp16[out_idx * 2 * hidden_dim + k]; + local_sum = __hadd(local_sum, __hmul(in_val, w_val)); + } + + // Warp-level reduction to get total sum + for (int offset = 16; offset > 0; offset /= 2) { + local_sum = __hadd(local_sum, __shfl_down_sync(0xffffffff, local_sum, offset)); + } + + // Only lane 0 has the final sum, add bias and store + if (lane == 0) { + local_sum = __hadd(local_sum, biases_fp16[out_idx]); + int16_t result = __half2int_rn(local_sum); + + // Store squared and linear outputs + if (out_idx < FC0_OUT) { + output_sqr[pos_idx * 2 * FC0_OUT + out_idx] = sqr_clipped_relu(result); + output_sqr[pos_idx * 2 * FC0_OUT + FC0_OUT + out_idx] = sqr_clipped_relu(result); + } else { + output_linear[pos_idx * 2] = clipped_relu(result); + output_linear[pos_idx * 2 + 1] = clipped_relu(result); + } + } + } +} + +/** + * Fused NNUE evaluation using tensor cores throughout + */ +__global__ void nnue_forward_tensor_core( + const accumulator_t *__restrict__ accumulators, + const half *__restrict__ fc0_weights, + const half *__restrict__ fc0_biases, + const half *__restrict__ fc1_weights, + const half *__restrict__ fc1_biases, + const half *__restrict__ fc2_weights, + const half *__restrict__ fc2_biases, + int32_t *__restrict__ output, + int hidden_dim, int batch_size) { + + extern __shared__ half shared_mem[]; + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + half *fc0_input = shared_mem; + half *fc0_output = shared_mem + 2 * hidden_dim; + half *fc1_output = fc0_output + 2 * (FC0_OUT + 1); + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + // Convert accumulators to FP16 + for (int i = threadIdx.x; i < 2 * hidden_dim; i += blockDim.x) { + const accumulator_t *acc = (i < hidden_dim) ? white_acc : black_acc; + int idx = (i < hidden_dim) ? i : i - hidden_dim; + int16_t val = static_cast(acc[idx] >> WEIGHT_SCALE_BITS); + fc0_input[i] = __int2half_rn(clipped_relu(val)); + } + __syncthreads(); + + // FC0 layer with tensor cores (simplified) + // ... (tensor core matrix multiply) + + // FC1 layer with tensor cores + // ... 
(tensor core matrix multiply) + + // FC2 layer (small, can use standard multiplication) + if (threadIdx.x == 0) { + half sum = fc2_biases[0]; + for (int i = 0; i < FC1_OUT; i++) { + sum = __hfma(fc1_output[i], fc2_weights[i], sum); + } + output[pos_idx] = __half2int_rn(sum); + } +} + +#endif // __CUDA_ARCH__ >= 700 + +// ============================================================================ +// INT8 Tensor Core Support (Turing SM 7.5+) +// ============================================================================ + +#if __CUDA_ARCH__ >= 750 + +/** + * FC layer using INT8 tensor cores (Turing and later) + * Provides even better performance for quantized inference + */ +__global__ void fc_layer_tensor_core_int8( + const int8_t *__restrict__ input, + const int8_t *__restrict__ weights, + const int32_t *__restrict__ biases, + int8_t *__restrict__ output, + int batch_size, int input_size, int output_size) { + + // INT8 tensor cores use 8x8x16 tiles on Turing + // 16x8x16 tiles on Ampere and later + + // Warp and lane IDs + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int warpN = blockIdx.y; + + // Note: INT8 WMMA requires different fragment types + // This is a simplified example - full implementation would use + // appropriate fragment types for INT8 + + // Bounds check + if (warpM * 16 >= batch_size || warpN * 16 >= output_size) { + return; + } + + // INT8 tensor core implementation would go here + // For now, this serves as a placeholder for future optimization +} + +#endif // __CUDA_ARCH__ >= 750 + +// ============================================================================ +// Host Interface Functions +// ============================================================================ + +extern "C" { + +/** + * Check if tensor cores are available on the current device + */ +bool cuda_tensor_cores_available(int device_id) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + // Tensor cores available on SM 7.0 (Volta) and later + return prop.major >= 7; +} + +/** + * Check if INT8 tensor cores are available + */ +bool cuda_int8_tensor_cores_available(int device_id) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + // INT8 tensor cores available on SM 7.5 (Turing) and later + return (prop.major > 7) || (prop.major == 7 && prop.minor >= 5); +} + +// Tensor core function implementations are architecture-specific +// and must be compiled with appropriate -arch flags + +/** + * FC layer with FP16 tensor cores + * Only available when compiled for SM 7.0+ + */ +void cuda_fc_layer_tensor_core_fp16( + const half *input, const half *weights, const half *biases, + half *output, int batch_size, int input_size, int output_size, + cudaStream_t stream) { + + // Runtime check for architecture support + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + dim3 block(128); // 4 warps per block + dim3 grid((batch_size + 15) / 16, // WMMA_M = 16 + (output_size + 15) / 16); // WMMA_N = 16 + + // Launch the kernel - it will be compiled for all architectures in CMAKE_CUDA_ARCHITECTURES + // The kernel code is conditionally compiled based on __CUDA_ARCH__ during device compilation + fc_layer_tensor_core_fp16<<>>( + input, weights, biases, output, batch_size, input_size, output_size); +} + +/** + * FC0 layer with tensor cores + * Only available when compiled for SM 7.0+ + */ +void cuda_fc0_layer_tensor_core( + const 
accumulator_t *accumulators, + const half *weights_fp16, const half *biases_fp16, + int8_t *output_sqr, int8_t *output_linear, + int hidden_dim, int batch_size, cudaStream_t stream) { + + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + dim3 block(128); + dim3 grid(batch_size); + size_t shared_mem = (2 * hidden_dim + 2 * (FC0_OUT + 1)) * sizeof(half); + + // Launch the kernel - it will be compiled for all architectures in CMAKE_CUDA_ARCHITECTURES + fc0_layer_tensor_core<<>>( + accumulators, weights_fp16, biases_fp16, + output_sqr, output_linear, hidden_dim, batch_size); +} + +/** + * Full NNUE forward pass with tensor cores + * Note: This is a simplified implementation. Full implementation would require + * complete tensor core matrix operations for all layers. + * Only available when compiled for SM 7.0+ + */ +void cuda_nnue_forward_tensor_core( + const accumulator_t *accumulators, + const half *fc0_weights, const half *fc0_biases, + const half *fc1_weights, const half *fc1_biases, + const half *fc2_weights, const half *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream) { + + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + // TODO: Implement full tensor core forward pass + // Currently this is a placeholder that demonstrates the API + // A complete implementation would: + // 1. Convert accumulators to FP16 + // 2. Use tensor cores for FC0 layer (hidden_dim -> FC0_OUT) + // 3. Use tensor cores for FC1 layer (FC0_OUT -> FC1_OUT) + // 4. Use standard ops for FC2 (small output, not worth tensor cores) + // 5. Apply activations and skip connections + + std::cerr << "[CUDA] Full tensor core forward pass not yet implemented" << std::endl; + std::cerr << "[CUDA] Use individual layer functions instead" << std::endl; +} + +} // extern "C" + +#endif // NNUE_CUDA_TENSOR_CORE_CU diff --git a/src/gpu/cuda/kernels/nnue_tensor_core.h b/src/gpu/cuda/kernels/nnue_tensor_core.h new file mode 100644 index 00000000..197b04fe --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_tensor_core.h @@ -0,0 +1,55 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE Tensor Core Kernels Header + + Interface for tensor core accelerated kernels. 
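+
+  Typical call pattern (illustrative sketch only; the device pointers
+  d_input, d_weights, d_biases, d_output, the dimensions, and the stream
+  are placeholders assumed to be set up by the caller):
+
+    if (cuda_tensor_cores_available(device_id)) {
+      cuda_fc_layer_tensor_core_fp16(d_input, d_weights, d_biases, d_output,
+                                     batch_size, input_size, output_size,
+                                     stream);
+    }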
+*/ + +#ifndef NNUE_CUDA_TENSOR_CORE_H +#define NNUE_CUDA_TENSOR_CORE_H + +#include +#include +#include + +using accumulator_t = int32_t; +using layer_weight_t = int8_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// Check if tensor cores are available +bool cuda_tensor_cores_available(int device_id); + +// Check if INT8 tensor cores are available +bool cuda_int8_tensor_cores_available(int device_id); + +// FC layer with FP16 tensor cores +void cuda_fc_layer_tensor_core_fp16( + const half *input, const half *weights, const half *biases, + half *output, int batch_size, int input_size, int output_size, + cudaStream_t stream); + +// FC0 layer with tensor cores +void cuda_fc0_layer_tensor_core( + const accumulator_t *accumulators, + const half *weights_fp16, const half *biases_fp16, + int8_t *output_sqr, int8_t *output_linear, + int hidden_dim, int batch_size, cudaStream_t stream); + +// Full NNUE forward pass with tensor cores +void cuda_nnue_forward_tensor_core( + const accumulator_t *accumulators, + const half *fc0_weights, const half *fc0_biases, + const half *fc1_weights, const half *fc1_biases, + const half *fc2_weights, const half *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif // NNUE_CUDA_TENSOR_CORE_H diff --git a/tests/test_cuda_advanced.cpp b/tests/test_cuda_advanced.cpp new file mode 100644 index 00000000..ed13b2aa --- /dev/null +++ b/tests/test_cuda_advanced.cpp @@ -0,0 +1,258 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Advanced CUDA Features Test + + Tests for CUDA graphs, multi-GPU, persistent kernels, and FP16 weights. +*/ + +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include "../src/gpu/cuda/cuda_backend.h" +#include "../src/gpu/cuda/cuda_graphs.h" +#include "../src/gpu/cuda/cuda_multi_gpu.h" +#include "../src/gpu/cuda/cuda_fp16_weights.h" + +using namespace MetalFish::GPU; +using namespace MetalFish::GPU::CUDA; + +// ============================================================================ +// CUDA Graphs Tests +// ============================================================================ + +bool test_cuda_graphs() { + std::cout << "\n[Test] CUDA Graphs" << std::endl; + + GraphManager manager; + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Test graph capture + bool started = manager.begin_capture(stream, "test_graph"); + if (!started) { + std::cerr << " Failed to begin capture" << std::endl; + cudaStreamDestroy(stream); + return false; + } + + // Simulate some operations (empty kernel for test) + void *dummy_buffer; + cudaMalloc(&dummy_buffer, 1024); + cudaMemsetAsync(dummy_buffer, 0, 1024, stream); + + bool ended = manager.end_capture(stream, "test_graph"); + if (!ended) { + std::cerr << " Failed to end capture" << std::endl; + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + return false; + } + + // Test graph replay + bool launched = manager.launch_graph("test_graph", stream); + if (!launched) { + std::cerr << " Failed to launch graph" << std::endl; + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + return false; + } + + cudaStreamSynchronize(stream); + + // Check statistics + auto stats = manager.get_stats(); + std::cout << " Graphs: " << stats.num_graphs + << ", Nodes: " << stats.total_nodes << std::endl; + + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + + std::cout << " PASSED" << std::endl; + return true; +} + +// 
============================================================================ +// Multi-GPU Tests +// ============================================================================ + +bool test_multi_gpu() { + std::cout << "\n[Test] Multi-GPU Support" << std::endl; + + MultiGPUManager manager; + + // Initialize with all GPUs + if (!manager.initialize(true)) { + std::cout << " SKIPPED (no GPUs available)" << std::endl; + return true; + } + + int num_gpus = manager.get_num_gpus(); + std::cout << " Number of GPUs: " << num_gpus << std::endl; + + // Test GPU enumeration + for (int i = 0; i < num_gpus; i++) { + const auto& info = manager.get_gpu_info(i); + std::cout << " GPU " << i << ": " << info.name + << " (SM " << info.compute_major << "." << info.compute_minor << ")" + << std::endl; + } + + // Test batch distribution + int batch_size = 1024; + auto distribution = manager.distribute_batch(batch_size); + + int total = 0; + for (size_t i = 0; i < distribution.size(); i++) { + std::cout << " GPU " << i << " gets " << distribution[i] << " items" << std::endl; + total += distribution[i]; + } + + if (total != batch_size) { + std::cerr << " Batch distribution mismatch: " << total << " vs " << batch_size << std::endl; + return false; + } + + // Test peer access if multiple GPUs + if (num_gpus > 1) { + manager.enable_peer_access(); + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// FP16 Weights Tests +// ============================================================================ + +bool test_fp16_weights() { + std::cout << "\n[Test] FP16 Weight Storage" << std::endl; + + FP16WeightManager manager; + + // Create test weights + const size_t size = 1024; + std::vector int16_weights(size); + std::vector int32_biases(32); + + for (size_t i = 0; i < size; i++) { + int16_weights[i] = static_cast(i % 128); + } + + for (size_t i = 0; i < 32; i++) { + int32_biases[i] = static_cast(i * 64); + } + + // Convert to FP16 + half* fp16_weights = manager.convert_and_store_weights(int16_weights.data(), size); + if (!fp16_weights) { + std::cerr << " Failed to convert weights" << std::endl; + return false; + } + + half* fp16_biases = manager.convert_and_store_biases(int32_biases.data(), 32); + if (!fp16_biases) { + std::cerr << " Failed to convert biases" << std::endl; + return false; + } + + // Verify conversion by copying back + std::vector verify_weights(size); + cudaMemcpy(verify_weights.data(), fp16_weights, size * sizeof(half), + cudaMemcpyDeviceToHost); + + // Check a few values + for (size_t i = 0; i < 10; i++) { + float expected = static_cast(int16_weights[i]) / 64.0f; + float actual = __half2float(verify_weights[i]); + if (std::abs(expected - actual) > 0.01f) { + std::cerr << " Conversion mismatch at index " << i << std::endl; + return false; + } + } + + size_t mem_usage = manager.get_memory_usage(); + std::cout << " Memory usage: " << (mem_usage / 1024) << " KB" << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Backend Integration Test +// ============================================================================ + +bool test_backend_features() { + std::cout << "\n[Test] Backend Feature Integration" << std::endl; + + auto &backend = CUDABackend::instance(); + + if (!backend.is_available()) { + std::cout << " SKIPPED (no CUDA device)" << std::endl; + return true; + } + + // Test feature enablement + 
backend.enable_cuda_graphs(true); + backend.enable_multi_gpu(false); // Keep single GPU for simplicity + backend.enable_persistent_kernels(false); + backend.enable_fp16_weights(backend.has_tensor_cores()); + + std::cout << " CUDA Graphs: " << (backend.is_cuda_graphs_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " Multi-GPU: " << (backend.is_multi_gpu_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " Persistent Kernels: " << (backend.is_persistent_kernels_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " FP16 Weights: " << (backend.is_fp16_weights_enabled() ? "ON" : "OFF") << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Main Test Runner +// ============================================================================ + +int main() { + std::cout << "======================================" << std::endl; + std::cout << "Advanced CUDA Features Tests" << std::endl; + std::cout << "======================================" << std::endl; + + int passed = 0; + int failed = 0; + + // Run tests + if (test_cuda_graphs()) passed++; else failed++; + if (test_multi_gpu()) passed++; else failed++; + if (test_fp16_weights()) passed++; else failed++; + if (test_backend_features()) passed++; else failed++; + + // Print summary + std::cout << "\n======================================" << std::endl; + std::cout << "Test Summary" << std::endl; + std::cout << "======================================" << std::endl; + std::cout << "Passed: " << passed << std::endl; + std::cout << "Failed: " << failed << std::endl; + std::cout << "Total: " << (passed + failed) << std::endl; + + if (failed == 0) { + std::cout << "\nAll tests PASSED! ✓" << std::endl; + } else { + std::cout << "\nSome tests FAILED! ✗" << std::endl; + } + + return (failed == 0) ? 0 : 1; +} + +#else // !USE_CUDA + +int main() { + std::cout << "CUDA support not enabled. Skipping tests." << std::endl; + return 0; +} + +#endif // USE_CUDA diff --git a/tests/test_cuda_optimizations.cpp b/tests/test_cuda_optimizations.cpp new file mode 100644 index 00000000..ae75943c --- /dev/null +++ b/tests/test_cuda_optimizations.cpp @@ -0,0 +1,361 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Optimization Tests + + Tests for tensor cores, warp primitives, and memory optimizations. 
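+
+  The warp-primitive kernels exercised here are built on the shuffle-based
+  reduction pattern from nnue_simd.cu, reproduced for reference only:
+
+    for (int offset = 16; offset > 0; offset /= 2) {
+      val += __shfl_down_sync(0xffffffff, val, offset);
+    }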
+*/ + +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include "../src/gpu/cuda/cuda_backend.h" +#include "../src/gpu/cuda/cuda_memory.h" +#include "../src/gpu/cuda/cuda_profiling.h" +#include "../src/gpu/cuda/kernels/nnue_simd.h" + +#ifdef USE_CUDA_TENSOR_CORES +#include "../src/gpu/cuda/kernels/nnue_tensor_core.h" +#endif + +using namespace MetalFish::GPU; + +namespace { + +// Helper function to compare arrays with tolerance +template +bool arrays_equal(const T *a, const T *b, size_t n, float tolerance = 1e-4f) { + for (size_t i = 0; i < n; i++) { + float diff = std::abs(static_cast(a[i]) - static_cast(b[i])); + if (diff > tolerance) { + std::cerr << "Mismatch at index " << i << ": " << a[i] << " vs " << b[i] + << " (diff: " << diff << ")" << std::endl; + return false; + } + } + return true; +} + +} // namespace + +// ============================================================================ +// Memory Management Tests +// ============================================================================ + +bool test_unified_memory() { + std::cout << "\n[Test] Unified Memory with Hints" << std::endl; + + const size_t size = 1024 * 1024; // 1MB + int device_id = 0; + + // Test basic unified memory allocation + void *ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id); + if (!ptr) { + std::cerr << " Failed to allocate unified memory" << std::endl; + return false; + } + + // Test read-only allocation + void *readonly_ptr = CUDA::UnifiedMemoryManager::allocate_unified_readonly(size, device_id); + if (!readonly_ptr) { + std::cerr << " Failed to allocate read-only unified memory" << std::endl; + CUDA::UnifiedMemoryManager::free_unified(ptr); + return false; + } + + // Test prefetching + CUDA::UnifiedMemoryManager::prefetch_to_device(ptr, size, device_id); + cudaDeviceSynchronize(); + + CUDA::UnifiedMemoryManager::prefetch_to_host(ptr, size); + cudaDeviceSynchronize(); + + // Cleanup + CUDA::UnifiedMemoryManager::free_unified(ptr); + CUDA::UnifiedMemoryManager::free_unified(readonly_ptr); + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_pinned_memory() { + std::cout << "\n[Test] Pinned Memory" << std::endl; + + const size_t size = 1024 * 1024; // 1MB + + // Test pinned allocation + void *ptr = CUDA::PinnedMemoryManager::allocate_pinned(size); + if (!ptr) { + std::cerr << " Failed to allocate pinned memory" << std::endl; + return false; + } + + // Test memory registration + std::vector host_mem(size); + if (!CUDA::PinnedMemoryManager::register_pinned(host_mem.data(), size)) { + std::cerr << " Failed to register pinned memory" << std::endl; + CUDA::PinnedMemoryManager::free_pinned(ptr); + return false; + } + + // Cleanup + CUDA::PinnedMemoryManager::unregister_pinned(host_mem.data()); + CUDA::PinnedMemoryManager::free_pinned(ptr); + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_double_buffer() { + std::cout << "\n[Test] Double Buffer" << std::endl; + + const size_t size = 1024; + int device_id = 0; + + CUDA::DoubleBuffer buffer(size, device_id); + + // Check if buffer was successfully initialized + if (!buffer.is_valid()) { + std::cerr << " Failed to initialize double buffer" << std::endl; + return false; + } + + // Fill buffer with test data + int *host_buf = buffer.get_host_buffer(); + if (!host_buf) { + std::cerr << " Failed to get host buffer" << std::endl; + return false; + } + + for (size_t i = 0; i < size; i++) { + host_buf[i] = static_cast(i); + } + + // First, we need to transfer the current buffer to device before 
swapping + cudaMemcpy(buffer.get_device_buffer(), host_buf, size * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + // Now swap - this prepares for the next iteration + buffer.swap_and_transfer(); + buffer.synchronize(); + + // The current device buffer should still have our data since we just copied it + int *device_buf = buffer.get_device_buffer(); + std::vector result(size); + cudaMemcpy(result.data(), device_buf, size * sizeof(int), cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < size; i++) { + if (result[i] != static_cast(i)) { + std::cerr << " Data mismatch at index " << i << std::endl; + return false; + } + } + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_memory_pool() { + std::cout << "\n[Test] Memory Pool" << std::endl; + + const size_t pool_size = 10 * 1024 * 1024; // 10MB + int device_id = 0; + + CUDA::MemoryPool pool(pool_size, device_id); + + // Test allocations + void *ptr1 = pool.allocate(1024); + void *ptr2 = pool.allocate(2048); + void *ptr3 = pool.allocate(4096); + + if (!ptr1 || !ptr2 || !ptr3) { + std::cerr << " Failed to allocate from pool" << std::endl; + return false; + } + + size_t allocated = pool.get_allocated(); + if (allocated < 7168) { // 1024 + 2048 + 4096 + std::cerr << " Incorrect allocation size: " << allocated << std::endl; + return false; + } + + // Test reset + pool.reset(); + if (pool.get_allocated() != 0) { + std::cerr << " Pool reset failed" << std::endl; + return false; + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Profiling Tests +// ============================================================================ + +bool test_kernel_timer() { + std::cout << "\n[Test] Kernel Timer" << std::endl; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Allocate a small buffer for the test + void *test_buffer; + cudaMalloc(&test_buffer, 1024); + + { + CUDA::KernelTimer timer("test_kernel", stream); + + // Simulate some work with actual operation + cudaMemsetAsync(test_buffer, 0, 1024, stream); + cudaStreamSynchronize(stream); + } + + float avg_time = CUDA::KernelTimer::get_average_time("test_kernel"); + if (avg_time < 0.0f) { + std::cerr << " Invalid timing result" << std::endl; + cudaFree(test_buffer); + cudaStreamDestroy(stream); + return false; + } + + cudaFree(test_buffer); + cudaStreamDestroy(stream); + + std::cout << " PASSED (avg time: " << avg_time << " ms)" << std::endl; + return true; +} + +bool test_bandwidth_measurement() { + std::cout << "\n[Test] Bandwidth Measurement" << std::endl; + + const size_t test_size = 64 * 1024 * 1024; // 64MB + + float h2d_bandwidth = CUDA::BandwidthTester::measure_h2d_bandwidth(test_size); + float d2h_bandwidth = CUDA::BandwidthTester::measure_d2h_bandwidth(test_size); + + std::cout << " H2D Bandwidth: " << h2d_bandwidth << " GB/s" << std::endl; + std::cout << " D2H Bandwidth: " << d2h_bandwidth << " GB/s" << std::endl; + + if (h2d_bandwidth <= 0.0f || d2h_bandwidth <= 0.0f) { + std::cerr << " Invalid bandwidth measurements" << std::endl; + return false; + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Tensor Core Tests +// ============================================================================ + +#ifdef USE_CUDA_TENSOR_CORES + +bool test_tensor_core_availability() { + std::cout << "\n[Test] Tensor Core Availability" << std::endl; + + int device_id = 0; + 
bool has_fp16 = cuda_tensor_cores_available(device_id); + bool has_int8 = cuda_int8_tensor_cores_available(device_id); + + std::cout << " FP16 Tensor Cores: " << (has_fp16 ? "Yes" : "No") << std::endl; + std::cout << " INT8 Tensor Cores: " << (has_int8 ? "Yes" : "No") << std::endl; + + // Just check that the function runs without error + std::cout << " PASSED" << std::endl; + return true; +} + +#endif // USE_CUDA_TENSOR_CORES + +// ============================================================================ +// Architecture Detection Tests +// ============================================================================ + +bool test_architecture_detection() { + std::cout << "\n[Test] Architecture Detection" << std::endl; + + auto &backend = CUDABackend::instance(); + + if (!backend.is_available()) { + std::cout << " SKIPPED (no CUDA device)" << std::endl; + return true; + } + + std::cout << " Device: " << backend.device_name() << std::endl; + std::cout << " Compute Capability: " + << backend.compute_capability_major() << "." + << backend.compute_capability_minor() << std::endl; + std::cout << " Multiprocessors: " << backend.multiprocessor_count() << std::endl; + std::cout << " Total Memory: " << (backend.total_memory() / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Has Tensor Cores: " << (backend.has_tensor_cores() ? "Yes" : "No") << std::endl; + std::cout << " Has INT8 Tensor Cores: " << (backend.has_int8_tensor_cores() ? "Yes" : "No") << std::endl; + std::cout << " Has Warp Shuffle: " << (backend.has_warp_shuffle() ? "Yes" : "No") << std::endl; + std::cout << " Has Cooperative Groups: " << (backend.has_cooperative_groups() ? "Yes" : "No") << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Main Test Runner +// ============================================================================ + +int main() { + std::cout << "======================================" << std::endl; + std::cout << "CUDA Optimization Tests" << std::endl; + std::cout << "======================================" << std::endl; + + int passed = 0; + int failed = 0; + + // Memory tests + if (test_unified_memory()) passed++; else failed++; + if (test_pinned_memory()) passed++; else failed++; + if (test_double_buffer()) passed++; else failed++; + if (test_memory_pool()) passed++; else failed++; + + // Profiling tests + if (test_kernel_timer()) passed++; else failed++; + if (test_bandwidth_measurement()) passed++; else failed++; + + // Architecture tests + if (test_architecture_detection()) passed++; else failed++; + +#ifdef USE_CUDA_TENSOR_CORES + // Tensor core tests + if (test_tensor_core_availability()) passed++; else failed++; +#endif + + // Print summary + std::cout << "\n======================================" << std::endl; + std::cout << "Test Summary" << std::endl; + std::cout << "======================================" << std::endl; + std::cout << "Passed: " << passed << std::endl; + std::cout << "Failed: " << failed << std::endl; + std::cout << "Total: " << (passed + failed) << std::endl; + + if (failed == 0) { + std::cout << "\nAll tests PASSED! ✓" << std::endl; + } else { + std::cout << "\nSome tests FAILED! ✗" << std::endl; + } + + return (failed == 0) ? 0 : 1; +} + +#else // !USE_CUDA + +int main() { + std::cout << "CUDA support not enabled. Skipping tests." << std::endl; + return 0; +} + +#endif // USE_CUDA