diff --git a/CMakeLists.txt b/CMakeLists.txt index 006092d0..374d84c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,11 @@ endif() # CUDA Configuration # ============================================================================ if(USE_CUDA) + # CUDA optimization options + option(CUDA_TENSOR_CORES "Enable tensor core kernels (Volta SM 7.0+)" ON) + option(CUDA_PROFILING "Enable NVTX profiling markers" OFF) + option(CUDA_WARP_PRIMITIVES "Enable warp-level primitive optimizations" ON) + # Find CUDA toolkit find_package(CUDAToolkit QUIET) @@ -164,6 +169,24 @@ if(USE_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fPIC") + + # Enable tensor core support + if(CUDA_TENSOR_CORES) + add_definitions(-DUSE_CUDA_TENSOR_CORES) + message(STATUS " Tensor Cores: ENABLED") + endif() + + # Enable NVTX profiling + if(CUDA_PROFILING) + add_definitions(-DUSE_NVTX) + message(STATUS " NVTX Profiling: ENABLED") + endif() + + # Enable warp primitives + if(CUDA_WARP_PRIMITIVES) + add_definitions(-DUSE_CUDA_WARP_PRIMITIVES) + message(STATUS " Warp Primitives: ENABLED") + endif() # Enable separable compilation for device code set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) @@ -221,6 +244,28 @@ set(GPU_SOURCES src/gpu/gpu_nnue_integration.cpp src/gpu/gpu_mcts_backend.cpp # CUDA source files set(CUDA_SOURCES "") +if(USE_CUDA AND CUDA_AVAILABLE) + set(CUDA_SOURCES + src/gpu/cuda/cuda_backend.cu + src/gpu/cuda/cuda_memory.cu + src/gpu/cuda/kernels/nnue_kernels.cu) + + # Add advanced optimization kernels if enabled + if(CUDA_WARP_PRIMITIVES) + list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_simd.cu) + endif() + + if(CUDA_TENSOR_CORES) + list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_tensor_core.cu) + endif() + + # Add advanced features + list(APPEND CUDA_SOURCES + src/gpu/cuda/cuda_graphs.cu + src/gpu/cuda/cuda_multi_gpu.cu + src/gpu/cuda/cuda_fp16_weights.cu + src/gpu/cuda/kernels/nnue_persistent.cu) +endif() # MCTS source files (hybrid search) Core files needed for all MCTS modes: - # stockfish_adapter: Interface between Stockfish and MCTS - position_classifier: @@ -265,10 +310,9 @@ if(USE_METAL AND METAL_CPP_AVAILABLE) endif() elseif(USE_CUDA AND CUDA_AVAILABLE) # CUDA GPU acceleration - set(CUDA_SOURCES src/gpu/cuda/cuda_backend.cu - src/gpu/cuda/kernels/nnue_kernels.cu) add_definitions(-DUSE_CUDA) message(STATUS "CUDA GPU acceleration: ENABLED") + message(STATUS " CUDA source files: ${CUDA_SOURCES}") else() # CPU fallback backend set(GPU_SOURCES ${GPU_SOURCES} src/gpu/cpu_backend.cpp) diff --git a/src/gpu/cuda/cuda_backend.cu b/src/gpu/cuda/cuda_backend.cu index f30374b4..b2e60310 100644 --- a/src/gpu/cuda/cuda_backend.cu +++ b/src/gpu/cuda/cuda_backend.cu @@ -15,6 +15,8 @@ #ifdef USE_CUDA #include "cuda_backend.h" +#include "cuda_memory.h" +#include "cuda_profiling.h" #include #include #include @@ -88,11 +90,11 @@ CUDABuffer::CUDABuffer(void *device_ptr, void *host_ptr, size_t size, CUDABuffer::~CUDABuffer() { if (device_ptr_) { if (unified_) { - cudaFree(device_ptr_); + CUDA::UnifiedMemoryManager::free_unified(device_ptr_); } else { cudaFree(device_ptr_); if (host_ptr_) { - cudaFreeHost(host_ptr_); + CUDA::PinnedMemoryManager::free_pinned(host_ptr_); } } } @@ -263,9 +265,11 @@ void CUDACommandEncoder::barrier() { cudaStreamSynchronize(stream_); } CUDABackend::CUDABackend() : device_id_(-1), compute_capability_major_(0), compute_capability_minor_(0), 
total_memory_(0), multiprocessor_count_(0),
-      unified_memory_supported_(false), default_stream_(nullptr),
+      unified_memory_supported_(false), tensor_cores_available_(false),
+      int8_tensor_cores_available_(false), default_stream_(nullptr),
       stream_index_(0), allocated_memory_(0), peak_memory_(0),
-      initialized_(false) {}
+      initialized_(false), use_cuda_graphs_(false), use_multi_gpu_(false),
+      use_persistent_kernels_(false), use_fp16_weights_(false) {}
 
 CUDABackend::~CUDABackend() { cleanup(); }
 
@@ -333,6 +337,9 @@ bool CUDABackend::initialize() {
     cudaStreamCreate(&parallel_streams_[i]);
   }
 
+  // Detect architecture-specific features
+  detect_architecture_features();
+
   initialized_ = true;
 
   std::cout << "[CUDA Backend] Initialized: " << device_name_ << std::endl;
@@ -345,10 +352,51 @@ bool CUDABackend::initialize() {
             << std::endl;
   std::cout << "[CUDA Backend] Unified Memory: "
             << (unified_memory_supported_ ? "Yes" : "No") << std::endl;
+  std::cout << "[CUDA Backend] Tensor Cores: "
+            << (tensor_cores_available_ ? "Yes" : "No") << std::endl;
+  if (tensor_cores_available_) {
+    std::cout << "[CUDA Backend] INT8 Tensor Cores: "
+              << (int8_tensor_cores_available_ ? "Yes" : "No") << std::endl;
+  }
 
   return true;
 }
 
+void CUDABackend::detect_architecture_features() {
+  // Detect tensor core support
+  // Volta (SM 7.0) and later have FP16 tensor cores
+  tensor_cores_available_ = compute_capability_major_ >= 7;
+
+  // Turing (SM 7.5) and later have INT8 tensor cores
+  this->int8_tensor_cores_available_ = (compute_capability_major_ > 7) ||
+                                       (compute_capability_major_ == 7 &&
+                                        compute_capability_minor_ >= 5);
+
+  // Print architecture-specific information
+  std::string arch_name;
+  if (compute_capability_major_ == 6 && compute_capability_minor_ == 0) {
+    arch_name = "Pascal (GP100)";
+  } else if (compute_capability_major_ == 6 && compute_capability_minor_ == 1) {
+    arch_name = "Pascal (GP10x)";
+  } else if (compute_capability_major_ == 7 && compute_capability_minor_ == 0) {
+    arch_name = "Volta";
+  } else if (compute_capability_major_ == 7 && compute_capability_minor_ == 5) {
+    arch_name = "Turing";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 0) {
+    arch_name = "Ampere (A100)";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 6) {
+    arch_name = "Ampere (GA10x)";
+  } else if (compute_capability_major_ == 8 && compute_capability_minor_ == 9) {
+    arch_name = "Ada Lovelace";
+  } else if (compute_capability_major_ == 9 && compute_capability_minor_ == 0) {
+    arch_name = "Hopper";
+  } else {
+    arch_name = "Unknown";
+  }
+
+  std::cout << "[CUDA Backend] Architecture: " << arch_name << std::endl;
+}
+
 void CUDABackend::cleanup() {
   if (!initialized_) {
     return;
@@ -462,11 +510,17 @@ std::unique_ptr CUDABackend::create_buffer(size_t size, MemoryMode mode,
   bool unified = false;
 
   if (mode == MemoryMode::Shared && unified_memory_supported_) {
-    // Use unified memory
-    cudaError_t err = cudaMallocManaged(&device_ptr, size);
-    if (err != cudaSuccess) {
+    // Use optimized unified memory with hints
+    device_ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id_);
+    if (!device_ptr) {
       return nullptr;
     }
+
+    // For read-only buffers (like weights), use read-mostly hint
+    if (usage == BufferUsage::Static) {
+      cudaMemAdvise(device_ptr, size, cudaMemAdviseSetReadMostly, device_id_);
+    }
+
     unified = true;
   } else {
    // Allocate device and host memory separately
@@ -476,8 +530,9 @@ std::unique_ptr CUDABackend::create_buffer(size_t size, MemoryMode mode,
     }
 
     if (mode !=
MemoryMode::Private) { - err = cudaMallocHost(&host_ptr, size); - if (err != cudaSuccess) { + // Use pinned memory for faster transfers + host_ptr = CUDA::PinnedMemoryManager::allocate_pinned(size); + if (!host_ptr) { cudaFree(device_ptr); return nullptr; } diff --git a/src/gpu/cuda/cuda_backend.h b/src/gpu/cuda/cuda_backend.h index a7036018..2760100d 100644 --- a/src/gpu/cuda/cuda_backend.h +++ b/src/gpu/cuda/cuda_backend.h @@ -169,6 +169,23 @@ class CUDABackend : public Backend { int compute_capability_minor() const { return compute_capability_minor_; } size_t total_memory() const { return total_memory_; } int multiprocessor_count() const { return multiprocessor_count_; } + bool has_tensor_cores() const { return tensor_cores_available_; } + bool has_int8_tensor_cores() const { return int8_tensor_cores_available_; } + bool has_warp_shuffle() const { return compute_capability_major_ >= 3; } + bool has_cooperative_groups() const { return compute_capability_major_ >= 6; } + + // Advanced feature support + void enable_cuda_graphs(bool enable) { use_cuda_graphs_ = enable; } + bool is_cuda_graphs_enabled() const { return use_cuda_graphs_; } + + void enable_multi_gpu(bool enable) { use_multi_gpu_ = enable; } + bool is_multi_gpu_enabled() const { return use_multi_gpu_; } + + void enable_persistent_kernels(bool enable) { use_persistent_kernels_ = enable; } + bool is_persistent_kernels_enabled() const { return use_persistent_kernels_; } + + void enable_fp16_weights(bool enable) { use_fp16_weights_ = enable; } + bool is_fp16_weights_enabled() const { return use_fp16_weights_; } private: CUDABackend(); @@ -176,6 +193,7 @@ class CUDABackend : public Backend { bool initialize(); void cleanup(); + void detect_architecture_features(); int device_id_; std::string device_name_; @@ -184,6 +202,14 @@ class CUDABackend : public Backend { size_t total_memory_; int multiprocessor_count_; bool unified_memory_supported_; + bool tensor_cores_available_; + bool int8_tensor_cores_available_; + + // Feature flags + bool use_cuda_graphs_; + bool use_multi_gpu_; + bool use_persistent_kernels_; + bool use_fp16_weights_; cudaStream_t default_stream_; std::vector parallel_streams_; diff --git a/src/gpu/cuda/cuda_fp16_weights.cu b/src/gpu/cuda/cuda_fp16_weights.cu new file mode 100644 index 00000000..cfd8d64e --- /dev/null +++ b/src/gpu/cuda/cuda_fp16_weights.cu @@ -0,0 +1,125 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + FP16 Weight Storage Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_fp16_weights.h" +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +FP16WeightManager::~FP16WeightManager() { + clear_all(); +} + +half* FP16WeightManager::convert_and_store_weights( + const int16_t* int16_weights, size_t size, float scale) { + + // Allocate host memory for FP16 conversion + std::vector fp16_host(size); + + // Convert INT16 to FP16 + for (size_t i = 0; i < size; i++) { + float val = static_cast(int16_weights[i]) / scale; + fp16_host[i] = __float2half(val); + } + + // Allocate device memory + half* device_ptr = nullptr; + cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half)); + if (err != cudaSuccess) { + std::cerr << "[FP16 Weights] Failed to allocate device memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Copy to device + err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "[FP16 Weights] Failed 
to copy to device: " + << cudaGetErrorString(err) << std::endl; + cudaFree(device_ptr); + return nullptr; + } + + total_memory_ += size * sizeof(half); + return device_ptr; +} + +half* FP16WeightManager::convert_and_store_biases( + const int32_t* int32_biases, size_t size, float scale) { + + // Allocate host memory for FP16 conversion + std::vector fp16_host(size); + + // Convert INT32 to FP16 + for (size_t i = 0; i < size; i++) { + float val = static_cast(int32_biases[i]) / scale; + fp16_host[i] = __float2half(val); + } + + // Allocate device memory + half* device_ptr = nullptr; + cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half)); + if (err != cudaSuccess) { + std::cerr << "[FP16 Biases] Failed to allocate device memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Copy to device + err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "[FP16 Biases] Failed to copy to device: " + << cudaGetErrorString(err) << std::endl; + cudaFree(device_ptr); + return nullptr; + } + + total_memory_ += size * sizeof(half); + return device_ptr; +} + +half* FP16WeightManager::get_fp16_weights(const std::string& layer_name) { + auto it = weights_.find(layer_name); + return (it != weights_.end()) ? it->second.device_ptr : nullptr; +} + +half* FP16WeightManager::get_fp16_biases(const std::string& layer_name) { + auto it = biases_.find(layer_name); + return (it != biases_.end()) ? it->second.device_ptr : nullptr; +} + +void FP16WeightManager::clear_all() { + for (auto& [name, data] : weights_) { + if (data.device_ptr) { + cudaFree(data.device_ptr); + } + } + + for (auto& [name, data] : biases_) { + if (data.device_ptr) { + cudaFree(data.device_ptr); + } + } + + weights_.clear(); + biases_.clear(); + total_memory_ = 0; +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA diff --git a/src/gpu/cuda/cuda_fp16_weights.h b/src/gpu/cuda/cuda_fp16_weights.h new file mode 100644 index 00000000..8daac2d3 --- /dev/null +++ b/src/gpu/cuda/cuda_fp16_weights.h @@ -0,0 +1,93 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + FP16 Weight Storage + + Provides FP16 weight storage and conversion for tensor core compatibility. +*/ + +#ifndef CUDA_FP16_WEIGHTS_H +#define CUDA_FP16_WEIGHTS_H + +#ifdef USE_CUDA + +#include +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * FP16 Weight Manager + * + * Manages conversion and storage of network weights in FP16 format + * for tensor core acceleration. 
+ */ +class FP16WeightManager { +public: + FP16WeightManager() = default; + ~FP16WeightManager(); + + /** + * Convert and store weights in FP16 format + * @param int16_weights Original INT16 weights + * @param size Number of weight elements + * @param scale Scale factor for conversion + * @return Device pointer to FP16 weights + */ + half* convert_and_store_weights(const int16_t* int16_weights, + size_t size, float scale = 64.0f); + + /** + * Convert and store biases in FP16 format + * @param int32_biases Original INT32 biases + * @param size Number of bias elements + * @param scale Scale factor for conversion + * @return Device pointer to FP16 biases + */ + half* convert_and_store_biases(const int32_t* int32_biases, + size_t size, float scale = 64.0f); + + /** + * Get FP16 weights for a layer + */ + half* get_fp16_weights(const std::string& layer_name); + + /** + * Get FP16 biases for a layer + */ + half* get_fp16_biases(const std::string& layer_name); + + /** + * Free all FP16 weights + */ + void clear_all(); + + /** + * Get total memory used by FP16 weights + */ + size_t get_memory_usage() const { return total_memory_; } + +private: + struct WeightData { + half* device_ptr = nullptr; + size_t size = 0; + }; + + std::unordered_map weights_; + std::unordered_map biases_; + size_t total_memory_ = 0; +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA +#endif // CUDA_FP16_WEIGHTS_H diff --git a/src/gpu/cuda/cuda_graphs.cu b/src/gpu/cuda/cuda_graphs.cu new file mode 100644 index 00000000..ec0eb856 --- /dev/null +++ b/src/gpu/cuda/cuda_graphs.cu @@ -0,0 +1,132 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Graphs Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_graphs.h" +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +GraphManager::~GraphManager() { + clear_all(); +} + +bool GraphManager::begin_capture(cudaStream_t stream, const std::string& name) { + if (has_graph(name)) { + std::cerr << "[CUDA Graphs] Graph '" << name << "' already exists" << std::endl; + return false; + } + + cudaError_t err = cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to begin capture: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + current_capture_name_ = name; + return true; +} + +bool GraphManager::end_capture(cudaStream_t stream, const std::string& name) { + if (current_capture_name_ != name) { + std::cerr << "[CUDA Graphs] Capture name mismatch" << std::endl; + cudaStreamEndCapture(stream, nullptr); // Abort capture + return false; + } + + GraphData data; + cudaError_t err = cudaStreamEndCapture(stream, &data.graph); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to end capture: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + // Get node count + cudaGraphGetNodes(data.graph, nullptr, &data.node_count); + + // Instantiate the graph for execution + err = cudaGraphInstantiate(&data.exec, data.graph, nullptr, nullptr, 0); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to instantiate graph: " + << cudaGetErrorString(err) << std::endl; + cudaGraphDestroy(data.graph); + return false; + } + + graphs_[name] = data; + current_capture_name_.clear(); + + std::cout << "[CUDA Graphs] Captured '" << name << "' with " + << data.node_count << " nodes" << std::endl; + return true; +} + +bool GraphManager::launch_graph(const std::string& name, 
cudaStream_t stream) { + auto it = graphs_.find(name); + if (it == graphs_.end()) { + std::cerr << "[CUDA Graphs] Graph '" << name << "' not found" << std::endl; + return false; + } + + cudaError_t err = cudaGraphLaunch(it->second.exec, stream); + if (err != cudaSuccess) { + std::cerr << "[CUDA Graphs] Failed to launch graph: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + return true; +} + +bool GraphManager::has_graph(const std::string& name) const { + return graphs_.find(name) != graphs_.end(); +} + +void GraphManager::remove_graph(const std::string& name) { + auto it = graphs_.find(name); + if (it != graphs_.end()) { + if (it->second.exec) { + cudaGraphExecDestroy(it->second.exec); + } + if (it->second.graph) { + cudaGraphDestroy(it->second.graph); + } + graphs_.erase(it); + } +} + +void GraphManager::clear_all() { + for (auto& [name, data] : graphs_) { + if (data.exec) { + cudaGraphExecDestroy(data.exec); + } + if (data.graph) { + cudaGraphDestroy(data.graph); + } + } + graphs_.clear(); +} + +GraphManager::GraphStats GraphManager::get_stats() const { + GraphStats stats{0, 0}; + stats.num_graphs = graphs_.size(); + for (const auto& [name, data] : graphs_) { + stats.total_nodes += data.node_count; + } + return stats; +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA diff --git a/src/gpu/cuda/cuda_graphs.h b/src/gpu/cuda/cuda_graphs.h new file mode 100644 index 00000000..69d0362d --- /dev/null +++ b/src/gpu/cuda/cuda_graphs.h @@ -0,0 +1,117 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Graphs Support + + Implements CUDA graphs for reduced kernel launch overhead. + CUDA graphs capture a sequence of operations and replay them efficiently. +*/ + +#ifndef CUDA_GRAPHS_H +#define CUDA_GRAPHS_H + +#ifdef USE_CUDA + +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * CUDA Graph Manager + * + * Captures and replays sequences of CUDA operations for improved performance. + * Particularly useful for repetitive evaluation patterns in NNUE. 
+ */ +class GraphManager { +public: + GraphManager() = default; + ~GraphManager(); + + /** + * Begin graph capture on a stream + */ + bool begin_capture(cudaStream_t stream, const std::string& name); + + /** + * End graph capture and store the graph + */ + bool end_capture(cudaStream_t stream, const std::string& name); + + /** + * Launch a captured graph + */ + bool launch_graph(const std::string& name, cudaStream_t stream); + + /** + * Check if a graph exists + */ + bool has_graph(const std::string& name) const; + + /** + * Delete a graph + */ + void remove_graph(const std::string& name); + + /** + * Clear all graphs + */ + void clear_all(); + + /** + * Get graph statistics + */ + struct GraphStats { + size_t num_graphs; + size_t total_nodes; + }; + GraphStats get_stats() const; + +private: + struct GraphData { + cudaGraph_t graph = nullptr; + cudaGraphExec_t exec = nullptr; + size_t node_count = 0; + }; + + std::unordered_map graphs_; + std::string current_capture_name_; +}; + +/** + * RAII helper for graph capture + */ +class ScopedGraphCapture { +public: + ScopedGraphCapture(GraphManager& manager, cudaStream_t stream, + const std::string& name) + : manager_(manager), stream_(stream), name_(name), active_(false) { + active_ = manager_.begin_capture(stream_, name_); + } + + ~ScopedGraphCapture() { + if (active_) { + manager_.end_capture(stream_, name_); + } + } + + bool is_active() const { return active_; } + +private: + GraphManager& manager_; + cudaStream_t stream_; + std::string name_; + bool active_; +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // USE_CUDA +#endif // CUDA_GRAPHS_H diff --git a/src/gpu/cuda/cuda_memory.cu b/src/gpu/cuda/cuda_memory.cu new file mode 100644 index 00000000..613fe09b --- /dev/null +++ b/src/gpu/cuda/cuda_memory.cu @@ -0,0 +1,439 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Advanced Memory Management + + Optimized memory management including: + - Unified memory with hints and prefetching + - Pinned memory for faster transfers + - Double buffering for async operations + - Memory pool management +*/ + +#ifndef CUDA_MEMORY_CU +#define CUDA_MEMORY_CU + +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +// ============================================================================ +// Unified Memory Manager +// ============================================================================ + +class UnifiedMemoryManager { +public: + /** + * Allocate unified memory with optimal hints + */ + static void *allocate_unified(size_t size, int device_id) { + void *ptr = nullptr; + cudaError_t err = cudaMallocManaged(&ptr, size); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate unified memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + // Set memory access hints for better performance + cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device_id); + cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device_id); + cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); + + return ptr; + } + + /** + * Allocate unified memory with read-mostly hint + * Useful for weight buffers that are rarely modified + */ + static void *allocate_unified_readonly(size_t size, int device_id) { + void *ptr = allocate_unified(size, device_id); + + if (ptr) { + // Mark as read-mostly for better caching + cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, 
device_id); + } + + return ptr; + } + + /** + * Prefetch data to device asynchronously + */ + static void prefetch_to_device(void *ptr, size_t size, int device_id, + cudaStream_t stream = 0) { + cudaMemPrefetchAsync(ptr, size, device_id, stream); + } + + /** + * Prefetch data to CPU asynchronously + */ + static void prefetch_to_host(void *ptr, size_t size, + cudaStream_t stream = 0) { + cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream); + } + + /** + * Free unified memory + */ + static void free_unified(void *ptr) { + if (ptr) { + cudaFree(ptr); + } + } +}; + +// ============================================================================ +// Pinned Memory Manager +// ============================================================================ + +class PinnedMemoryManager { +public: + /** + * Allocate pinned (page-locked) host memory + * Provides faster CPU-GPU transfers + */ + static void *allocate_pinned(size_t size) { + void *ptr = nullptr; + cudaError_t err = cudaMallocHost(&ptr, size); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate pinned memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + return ptr; + } + + /** + * Register existing host memory as pinned + * Useful for making existing allocations DMA-capable + */ + static bool register_pinned(void *ptr, size_t size) { + cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterDefault); + + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to register pinned memory: " + << cudaGetErrorString(err) << std::endl; + return false; + } + + return true; + } + + /** + * Unregister pinned memory + */ + static void unregister_pinned(void *ptr) { + if (ptr) { + cudaHostUnregister(ptr); + } + } + + /** + * Free pinned memory + */ + static void free_pinned(void *ptr) { + if (ptr) { + cudaFreeHost(ptr); + } + } +}; + +// ============================================================================ +// Double Buffer for Async Operations +// ============================================================================ + +template +class DoubleBuffer { +public: + DoubleBuffer(size_t size, int device_id) + : size_(size), device_id_(device_id), current_buffer_(0), + host_buffers_{nullptr, nullptr}, device_buffers_{nullptr, nullptr}, + compute_stream_(nullptr), copy_stream_(nullptr), valid_(false) { + + // Allocate two pinned host buffers + host_buffers_[0] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[0]) return; + + host_buffers_[1] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[1]) return; + + // Allocate device buffers + if (cudaMalloc(&device_buffers_[0], size * sizeof(T)) != cudaSuccess) return; + if (cudaMalloc(&device_buffers_[1], size * sizeof(T)) != cudaSuccess) return; + + // Create streams for concurrent operations + if (cudaStreamCreate(&compute_stream_) != cudaSuccess) return; + if (cudaStreamCreate(©_stream_) != cudaSuccess) return; + + valid_ = true; + } + + ~DoubleBuffer() { + // Free host buffers (check for nullptr in case construction failed partway) + if (host_buffers_[0]) PinnedMemoryManager::free_pinned(host_buffers_[0]); + if (host_buffers_[1]) PinnedMemoryManager::free_pinned(host_buffers_[1]); + + // Free device buffers + if (device_buffers_[0]) cudaFree(device_buffers_[0]); + if (device_buffers_[1]) cudaFree(device_buffers_[1]); + + // Destroy streams + if (compute_stream_) cudaStreamDestroy(compute_stream_); + if (copy_stream_) 
cudaStreamDestroy(copy_stream_); + } + + /** + * Get current host buffer for writing + */ + T *get_host_buffer() { + return host_buffers_[current_buffer_]; + } + + /** + * Get current device buffer for compute + */ + T *get_device_buffer() { + return device_buffers_[current_buffer_]; + } + + /** + * Swap buffers and initiate async transfer + * While computing on buffer N, prefetch buffer N+1 + */ + void swap_and_transfer() { + int next_buffer = 1 - current_buffer_; + + // Copy next buffer to device asynchronously + cudaMemcpyAsync(device_buffers_[next_buffer], + host_buffers_[next_buffer], + size_ * sizeof(T), + cudaMemcpyHostToDevice, + copy_stream_); + + // Swap for next iteration + current_buffer_ = next_buffer; + } + + /** + * Wait for all operations to complete + */ + void synchronize() { + cudaStreamSynchronize(compute_stream_); + cudaStreamSynchronize(copy_stream_); + } + + cudaStream_t get_compute_stream() { return compute_stream_; } + cudaStream_t get_copy_stream() { return copy_stream_; } + +private: + size_t size_; + int device_id_; + int current_buffer_; + + T *host_buffers_[2]; + T *device_buffers_[2]; + + cudaStream_t compute_stream_; + cudaStream_t copy_stream_; + bool valid_; +}; + +// ============================================================================ +// Memory Pool for Efficient Allocation +// ============================================================================ + +class MemoryPool { +public: + MemoryPool(size_t pool_size, int device_id) + : pool_size_(pool_size), device_id_(device_id), allocated_(0), pool_base_(nullptr) { + + // Allocate large contiguous block + cudaError_t err = cudaMalloc(&pool_base_, pool_size); + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory Pool] Failed to allocate pool: " + << cudaGetErrorString(err) << std::endl; + pool_base_ = nullptr; + } + } + + ~MemoryPool() { + if (pool_base_) { + cudaFree(pool_base_); + } + } + + /** + * Allocate from pool (simple bump allocator) + */ + void *allocate(size_t size, size_t alignment = 256) { + std::lock_guard lock(mutex_); + + if (!pool_base_) return nullptr; + + // Align allocation + size_t aligned_offset = (allocated_ + alignment - 1) & ~(alignment - 1); + + if (aligned_offset + size > pool_size_) { + std::cerr << "[CUDA Memory Pool] Out of pool memory" << std::endl; + return nullptr; + } + + void *ptr = static_cast(pool_base_) + aligned_offset; + allocated_ = aligned_offset + size; + + return ptr; + } + + /** + * Reset pool (invalidates all previous allocations) + */ + void reset() { + std::lock_guard lock(mutex_); + allocated_ = 0; + } + + size_t get_allocated() const { return allocated_; } + size_t get_available() const { return pool_size_ - allocated_; } + +private: + void *pool_base_; + size_t pool_size_; + size_t allocated_; + int device_id_; + std::mutex mutex_; +}; + +// ============================================================================ +// Cache-Aligned Allocator +// ============================================================================ + +/** + * Allocate memory with specific cache line alignment + * Important for avoiding false sharing and optimizing cache usage + * Note: alignment must be a power of 2 + */ +class CacheAlignedAllocator { +public: + /** + * Allocate device memory aligned to cache line (128 bytes default) + * @param size Size to allocate in bytes + * @param alignment Alignment in bytes (must be power of 2, default 128) + * @return Aligned device pointer or nullptr on failure + */ + static void *allocate_aligned(size_t size, size_t alignment 
= 128) { + // Validate alignment is power of 2 + if (alignment == 0 || (alignment & (alignment - 1)) != 0) { + std::cerr << "[CUDA Memory] Alignment must be a power of 2" << std::endl; + return nullptr; + } + + // CUDA allocations are already 256-byte aligned, but we can ensure it + void *ptr = nullptr; + + // Calculate aligned size (alignment must be power of 2) + size_t aligned_size = (size + alignment - 1) & ~(alignment - 1); + + cudaError_t err = cudaMalloc(&ptr, aligned_size); + if (err != cudaSuccess) { + std::cerr << "[CUDA Memory] Failed to allocate aligned memory: " + << cudaGetErrorString(err) << std::endl; + return nullptr; + } + + return ptr; + } + + static void free_aligned(void *ptr) { + if (ptr) { + cudaFree(ptr); + } + } +}; + +// ============================================================================ +// Async Memory Operations Helper +// ============================================================================ + +class AsyncMemoryOps { +public: + /** + * Async memcpy with event synchronization + */ + static void copy_async_with_event(void *dst, const void *src, size_t size, + cudaMemcpyKind kind, cudaStream_t stream, + cudaEvent_t *completion_event = nullptr) { + cudaMemcpyAsync(dst, src, size, kind, stream); + + if (completion_event) { + cudaEventRecord(*completion_event, stream); + } + } + + /** + * Async memset + */ + static void memset_async(void *ptr, int value, size_t size, + cudaStream_t stream) { + cudaMemsetAsync(ptr, value, size, stream); + } + + /** + * 2D memcpy for efficient matrix transfers + */ + static void copy_2d_async(void *dst, size_t dpitch, + const void *src, size_t spitch, + size_t width, size_t height, + cudaMemcpyKind kind, cudaStream_t stream) { + cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream); + } +}; + +// ============================================================================ +// Memory Statistics +// ============================================================================ + +class MemoryStats { +public: + static void print_memory_info(int device_id) { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + + size_t used_mem = total_mem - free_mem; + + std::cout << "[CUDA Memory Stats] Device " << device_id << std::endl; + std::cout << " Total: " << (total_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Used: " << (used_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Free: " << (free_mem / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Utilization: " << (100.0 * used_mem / total_mem) << "%" << std::endl; + } + + static size_t get_free_memory() { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + return free_mem; + } + + static size_t get_total_memory() { + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + return total_mem; + } +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_MEMORY_CU diff --git a/src/gpu/cuda/cuda_memory.h b/src/gpu/cuda/cuda_memory.h new file mode 100644 index 00000000..e340fa70 --- /dev/null +++ b/src/gpu/cuda/cuda_memory.h @@ -0,0 +1,183 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Advanced Memory Management Header + + Interface for optimized memory management utilities. 
+*/ + +#ifndef CUDA_MEMORY_H +#define CUDA_MEMORY_H + +#include +#include +#include +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +/** + * Unified Memory Manager + * + * Provides optimized unified memory allocation with hints + */ +class UnifiedMemoryManager { +public: + static void *allocate_unified(size_t size, int device_id); + static void *allocate_unified_readonly(size_t size, int device_id); + static void prefetch_to_device(void *ptr, size_t size, int device_id, + cudaStream_t stream = 0); + static void prefetch_to_host(void *ptr, size_t size, + cudaStream_t stream = 0); + static void free_unified(void *ptr); +}; + +/** + * Pinned Memory Manager + * + * Manages pinned (page-locked) host memory for faster transfers + */ +class PinnedMemoryManager { +public: + static void *allocate_pinned(size_t size); + static void free_pinned(void *ptr); +}; + +/** + * Double Buffer + * + * Implements double buffering for overlapping transfers and computation + */ +template +class DoubleBuffer { +public: + DoubleBuffer(size_t size, int device_id); + ~DoubleBuffer(); + + bool is_valid() const { return valid_; } + T *get_host_buffer(int index) const; + T *get_device_buffer(int index) const; + void swap_buffers(); + void transfer_to_device(int index, cudaStream_t stream); + void transfer_from_device(int index, cudaStream_t stream); + +private: + T *host_buffers_[2]; + T *device_buffers_[2]; + cudaStream_t streams_[2]; + size_t size_; + int current_index_; + bool valid_; +}; + +/** + * Memory Pool + * + * Simple memory pool allocator for reducing allocation overhead + */ +class MemoryPool { +public: + MemoryPool(size_t pool_size, int device_id); + ~MemoryPool(); + + void *allocate(size_t size); + void reset(); + size_t get_allocated() const { return allocated_; } + +private: + void *pool_base_; + size_t pool_size_; + size_t allocated_; + int device_id_; +}; + +/** + * Cache-Aligned Allocator + * + * Allocates memory with specified alignment for optimal cache performance + */ +class CacheAlignedAllocator { +public: + static void *allocate_aligned(size_t size, size_t alignment); + static void free_aligned(void *ptr); +}; + +// ============================================================================ +// Template Implementation for DoubleBuffer +// ============================================================================ + +template +DoubleBuffer::DoubleBuffer(size_t size, int device_id) + : size_(size), current_index_(0), + host_buffers_{nullptr, nullptr}, device_buffers_{nullptr, nullptr}, + streams_{nullptr, nullptr}, valid_(false) { + + // Allocate two pinned host buffers + host_buffers_[0] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[0]) return; + + host_buffers_[1] = static_cast(PinnedMemoryManager::allocate_pinned(size * sizeof(T))); + if (!host_buffers_[1]) return; + + // Allocate device buffers + if (cudaMalloc(&device_buffers_[0], size * sizeof(T)) != cudaSuccess) return; + if (cudaMalloc(&device_buffers_[1], size * sizeof(T)) != cudaSuccess) return; + + // Create streams for concurrent operations + if (cudaStreamCreate(&streams_[0]) != cudaSuccess) return; + if (cudaStreamCreate(&streams_[1]) != cudaSuccess) return; + + valid_ = true; +} + +template +DoubleBuffer::~DoubleBuffer() { + // Free host buffers (check for nullptr in case construction failed partway) + if (host_buffers_[0]) PinnedMemoryManager::free_pinned(host_buffers_[0]); + if (host_buffers_[1]) PinnedMemoryManager::free_pinned(host_buffers_[1]); 
+ + // Free device buffers + if (device_buffers_[0]) cudaFree(device_buffers_[0]); + if (device_buffers_[1]) cudaFree(device_buffers_[1]); + + // Destroy streams + if (streams_[0]) cudaStreamDestroy(streams_[0]); + if (streams_[1]) cudaStreamDestroy(streams_[1]); +} + +template +T *DoubleBuffer::get_host_buffer(int index) const { + return host_buffers_[index]; +} + +template +T *DoubleBuffer::get_device_buffer(int index) const { + return device_buffers_[index]; +} + +template +void DoubleBuffer::swap_buffers() { + current_index_ = 1 - current_index_; +} + +template +void DoubleBuffer::transfer_to_device(int index, cudaStream_t stream) { + cudaMemcpyAsync(device_buffers_[index], host_buffers_[index], + size_ * sizeof(T), cudaMemcpyHostToDevice, stream); +} + +template +void DoubleBuffer::transfer_from_device(int index, cudaStream_t stream) { + cudaMemcpyAsync(host_buffers_[index], device_buffers_[index], + size_ * sizeof(T), cudaMemcpyDeviceToHost, stream); +} + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_MEMORY_H diff --git a/src/gpu/cuda/cuda_multi_gpu.cu b/src/gpu/cuda/cuda_multi_gpu.cu new file mode 100644 index 00000000..f6e679b9 --- /dev/null +++ b/src/gpu/cuda/cuda_multi_gpu.cu @@ -0,0 +1,226 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Multi-GPU Implementation +*/ + +#ifdef USE_CUDA + +#include "cuda_multi_gpu.h" +#include +#include + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +MultiGPUManager::MultiGPUManager() : initialized_(false), original_device_(0) { + cudaGetDevice(&original_device_); +} + +MultiGPUManager::~MultiGPUManager() { + if (initialized_) { + cudaSetDevice(original_device_); + } +} + +bool MultiGPUManager::initialize(bool use_all) { + if (initialized_) { + return true; + } + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + std::cerr << "[Multi-GPU] No CUDA devices found" << std::endl; + return false; + } + + std::cout << "[Multi-GPU] Found " << device_count << " CUDA device(s)" << std::endl; + + // Collect GPU information + std::vector all_gpus; + for (int i = 0; i < device_count; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + + GPUInfo info; + info.device_id = i; + info.name = prop.name; + info.compute_major = prop.major; + info.compute_minor = prop.minor; + info.total_memory = prop.totalGlobalMem; + info.multiprocessor_count = prop.multiProcessorCount; + info.has_tensor_cores = (prop.major >= 7); + info.has_peer_access = false; + + all_gpus.push_back(info); + + std::cout << "[Multi-GPU] GPU " << i << ": " << info.name + << " (SM " << info.compute_major << "." 
<< info.compute_minor << ")" << std::endl; + } + + if (use_all) { + // Use all GPUs + gpu_info_ = all_gpus; + } else { + // Use only the best GPU + auto best_gpu = std::max_element(all_gpus.begin(), all_gpus.end(), + [](const GPUInfo& a, const GPUInfo& b) { + int score_a = a.compute_major * 100 + a.compute_minor; + int score_b = b.compute_major * 100 + b.compute_minor; + return score_a < score_b; + }); + gpu_info_.push_back(*best_gpu); + } + + initialized_ = true; + std::cout << "[Multi-GPU] Using " << gpu_info_.size() << " GPU(s)" << std::endl; + + return true; +} + +const GPUInfo& MultiGPUManager::get_gpu_info(int gpu_index) const { + return gpu_info_[gpu_index]; +} + +int MultiGPUManager::get_best_gpu() const { + if (gpu_info_.empty()) { + return 0; + } + + int best_idx = 0; + int best_score = gpu_info_[0].compute_major * 100 + gpu_info_[0].compute_minor; + + for (size_t i = 1; i < gpu_info_.size(); i++) { + int score = gpu_info_[i].compute_major * 100 + gpu_info_[i].compute_minor; + if (score > best_score) { + best_score = score; + best_idx = static_cast(i); + } + } + + return best_idx; +} + +bool MultiGPUManager::enable_peer_access() { + if (gpu_info_.size() < 2) { + return true; // Nothing to do with single GPU + } + + std::cout << "[Multi-GPU] Enabling peer-to-peer access..." << std::endl; + + for (size_t i = 0; i < gpu_info_.size(); i++) { + cudaSetDevice(gpu_info_[i].device_id); + + for (size_t j = 0; j < gpu_info_.size(); j++) { + if (i == j) continue; + + int can_access = 0; + cudaDeviceCanAccessPeer(&can_access, gpu_info_[i].device_id, + gpu_info_[j].device_id); + + if (can_access) { + cudaError_t err = cudaDeviceEnablePeerAccess(gpu_info_[j].device_id, 0); + if (err == cudaSuccess) { + gpu_info_[i].has_peer_access = true; + std::cout << "[Multi-GPU] Enabled P2P: GPU " << i << " -> GPU " << j << std::endl; + } else if (err != cudaErrorPeerAccessAlreadyEnabled) { + std::cerr << "[Multi-GPU] Failed to enable P2P: " + << cudaGetErrorString(err) << std::endl; + } else { + // Already enabled, clear the error + cudaGetLastError(); + } + } + } + } + + cudaSetDevice(original_device_); + return true; +} + +std::vector MultiGPUManager::distribute_batch(int total_batch_size) const { + std::vector batch_sizes(gpu_info_.size()); + + if (gpu_info_.size() == 1) { + batch_sizes[0] = total_batch_size; + return batch_sizes; + } + + // Distribute based on relative compute capability + std::vector scores; + int total_score = 0; + + for (const auto& info : gpu_info_) { + int score = info.multiprocessor_count * (info.compute_major * 10 + info.compute_minor); + scores.push_back(score); + total_score += score; + } + + // Distribute proportionally + int remaining = total_batch_size; + for (size_t i = 0; i < gpu_info_.size(); i++) { + if (i == gpu_info_.size() - 1) { + // Last GPU gets all remaining + batch_sizes[i] = remaining; + } else { + int size = (total_batch_size * scores[i]) / total_score; + batch_sizes[i] = size; + remaining -= size; + } + } + + return batch_sizes; +} + +bool MultiGPUManager::set_device(int gpu_index) { + if (gpu_index < 0 || gpu_index >= static_cast(gpu_info_.size())) { + return false; + } + + cudaError_t err = cudaSetDevice(gpu_info_[gpu_index].device_id); + return err == cudaSuccess; +} + +int MultiGPUManager::get_current_device() const { + int device; + cudaGetDevice(&device); + + // Find index in our list + for (size_t i = 0; i < gpu_info_.size(); i++) { + if (gpu_info_[i].device_id == device) { + return static_cast(i); + } + } + + return 0; +} + +void 
MultiGPUManager::synchronize_all() {
+  int current_device;
+  cudaGetDevice(&current_device);
+
+  for (const auto& info : gpu_info_) {
+    cudaSetDevice(info.device_id);
+    cudaDeviceSynchronize();
+  }
+
+  cudaSetDevice(current_device);
+}
+
+ScopedDevice::ScopedDevice(int device_id) : saved_device_(0) {
+  cudaGetDevice(&saved_device_);
+  cudaSetDevice(device_id);
+}
+
+ScopedDevice::~ScopedDevice() {
+  cudaSetDevice(saved_device_);
+}
+
+} // namespace CUDA
+} // namespace GPU
+} // namespace MetalFish
+
+#endif // USE_CUDA
diff --git a/src/gpu/cuda/cuda_multi_gpu.h b/src/gpu/cuda/cuda_multi_gpu.h
new file mode 100644
index 00000000..0b0ffc2a
--- /dev/null
+++ b/src/gpu/cuda/cuda_multi_gpu.h
@@ -0,0 +1,123 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  Multi-GPU Support
+
+  Enables batch distribution across multiple NVIDIA GPUs.
+*/
+
+#ifndef CUDA_MULTI_GPU_H
+#define CUDA_MULTI_GPU_H
+
+#ifdef USE_CUDA
+
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+
+namespace MetalFish {
+namespace GPU {
+namespace CUDA {
+
+/**
+ * GPU Device Information
+ */
+struct GPUInfo {
+  int device_id;
+  std::string name;
+  int compute_major;
+  int compute_minor;
+  size_t total_memory;
+  int multiprocessor_count;
+  bool has_tensor_cores;
+  bool has_peer_access;
+};
+
+/**
+ * Multi-GPU Manager
+ *
+ * Manages multiple GPUs for parallel batch processing.
+ */
+class MultiGPUManager {
+public:
+  MultiGPUManager();
+  ~MultiGPUManager();
+
+  /**
+   * Initialize multi-GPU support
+   * @param use_all If true, use all available GPUs. Otherwise, use best GPU only.
+   * @return true if at least one GPU is available
+   */
+  bool initialize(bool use_all = false);
+
+  /**
+   * Get number of active GPUs
+   */
+  int get_num_gpus() const { return static_cast<int>(gpu_info_.size()); }
+
+  /**
+   * Get GPU information
+   */
+  const GPUInfo& get_gpu_info(int gpu_index) const;
+
+  /**
+   * Get best GPU (highest compute capability)
+   */
+  int get_best_gpu() const;
+
+  /**
+   * Enable peer-to-peer access between GPUs
+   */
+  bool enable_peer_access();
+
+  /**
+   * Distribute batch across GPUs
+   * Returns the batch size for each GPU
+   */
+  std::vector<int> distribute_batch(int total_batch_size) const;
+
+  /**
+   * Set current device
+   */
+  bool set_device(int gpu_index);
+
+  /**
+   * Get current device
+   */
+  int get_current_device() const;
+
+  /**
+   * Synchronize all GPUs
+   */
+  void synchronize_all();
+
+  /**
+   * Check if multi-GPU is enabled
+   */
+  bool is_multi_gpu_enabled() const { return gpu_info_.size() > 1; }
+
+private:
+  std::vector<GPUInfo> gpu_info_;
+  bool initialized_;
+  int original_device_;
+};
+
+/**
+ * RAII helper to switch GPU device temporarily
+ */
+class ScopedDevice {
+public:
+  ScopedDevice(int device_id);
+  ~ScopedDevice();
+
+private:
+  int saved_device_;
+};
+
+} // namespace CUDA
+} // namespace GPU
+} // namespace MetalFish
+
+#endif // USE_CUDA
+#endif // CUDA_MULTI_GPU_H
diff --git a/src/gpu/cuda/cuda_profiling.h b/src/gpu/cuda/cuda_profiling.h
new file mode 100644
index 00000000..93c14824
--- /dev/null
+++ b/src/gpu/cuda/cuda_profiling.h
@@ -0,0 +1,440 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  CUDA Profiling Infrastructure
+
+  Profiling utilities including:
+  - NVTX markers for Nsight profiling
+  - Kernel timing
+  - Occupancy calculator
+  - Performance metrics collection
+*/
+
+#ifndef CUDA_PROFILING_H
+#define CUDA_PROFILING_H
+
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// NVTX profiling support (optional)
+#ifdef USE_NVTX
+#include +#endif + +namespace MetalFish { +namespace GPU { +namespace CUDA { + +// ============================================================================ +// NVTX Markers (for Nsight profiling) +// ============================================================================ + +class NVTXMarker { +public: +#ifdef USE_NVTX + NVTXMarker(const char *name, uint32_t color = 0xFF00FF00) { + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name; + + nvtxRangePushEx(&eventAttrib); + } + + ~NVTXMarker() { + nvtxRangePop(); + } +#else + NVTXMarker(const char *, uint32_t = 0) {} + ~NVTXMarker() {} +#endif +}; + +// Convenience macro +#define NVTX_RANGE(name) NVTXMarker _nvtx_marker(name) +#define NVTX_RANGE_COLOR(name, color) NVTXMarker _nvtx_marker(name, color) + +// ============================================================================ +// Kernel Timer +// ============================================================================ + +class KernelTimer { +public: + KernelTimer(const std::string &name, cudaStream_t stream = 0) + : name_(name), stream_(stream) { + cudaEventCreate(&start_event_); + cudaEventCreate(&stop_event_); + cudaEventRecord(start_event_, stream_); + } + + ~KernelTimer() { + cudaEventRecord(stop_event_, stream_); + cudaEventSynchronize(stop_event_); + + float ms = 0.0f; + cudaEventElapsedTime(&ms, start_event_, stop_event_); + + // Record timing with thread safety + { + std::lock_guard lock(timings_mutex_); + timings_[name_].push_back(ms); + } + + cudaEventDestroy(start_event_); + cudaEventDestroy(stop_event_); + } + + // Get average time for a kernel + static float get_average_time(const std::string &name) { + std::lock_guard lock(timings_mutex_); + auto it = timings_.find(name); + if (it == timings_.end() || it->second.empty()) { + return 0.0f; + } + + float sum = 0.0f; + for (float t : it->second) { + sum += t; + } + return sum / it->second.size(); + } + + // Print all timing statistics + static void print_stats() { + std::cout << "\n[CUDA Kernel Timing Statistics]" << std::endl; + std::cout << "======================================" << std::endl; + + for (const auto &[name, times] : timings_) { + if (times.empty()) continue; + + float sum = 0.0f, min_time = times[0], max_time = times[0]; + for (float t : times) { + sum += t; + min_time = std::min(min_time, t); + max_time = std::max(max_time, t); + } + float avg = sum / times.size(); + + std::cout << name << ":" << std::endl; + std::cout << " Calls: " << times.size() << std::endl; + std::cout << " Average: " << avg << " ms" << std::endl; + std::cout << " Min: " << min_time << " ms" << std::endl; + std::cout << " Max: " << max_time << " ms" << std::endl; + std::cout << " Total: " << sum << " ms" << std::endl; + } + } + + // Reset all timings + static void reset() { + timings_.clear(); + } + +private: + std::string name_; + cudaStream_t stream_; + cudaEvent_t start_event_; + cudaEvent_t stop_event_; + + static std::map> timings_; +}; + +// Convenience macro +#define TIME_KERNEL(name, stream) KernelTimer _kernel_timer(name, stream) + +// ============================================================================ +// Occupancy Calculator +// ============================================================================ + +class OccupancyCalculator { +public: + /** + * Calculate 
theoretical occupancy for a kernel + */ + static float calculate_occupancy(const void *kernel, int block_size, + size_t dynamic_smem_size = 0) { + int min_grid_size, optimal_block_size; + + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + + // Get device properties + cudaDeviceProp prop; + int device; + cudaGetDevice(&device); + cudaGetDeviceProperties(&prop, device); + + // Calculate occupancy + int max_active_blocks; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, kernel, + block_size, dynamic_smem_size); + + float occupancy = (max_active_blocks * block_size / + static_cast(prop.maxThreadsPerMultiProcessor)); + + return occupancy; + } + + /** + * Print occupancy information for a kernel + */ + static void print_occupancy_info(const std::string &name, const void *kernel, + int block_size, size_t dynamic_smem_size = 0) { + float occupancy = calculate_occupancy(kernel, block_size, dynamic_smem_size); + + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, kernel); + + std::cout << "\n[Occupancy Info: " << name << "]" << std::endl; + std::cout << " Block Size: " << block_size << std::endl; + std::cout << " Registers/Thread: " << attr.numRegs << std::endl; + std::cout << " Shared Mem: " << (attr.sharedSizeBytes + dynamic_smem_size) << " bytes" << std::endl; + std::cout << " Occupancy: " << (occupancy * 100.0f) << "%" << std::endl; + + // Suggest optimal block size + int min_grid_size, optimal_block_size; + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + std::cout << " Optimal Block Size: " << optimal_block_size << std::endl; + } + + /** + * Auto-tune block size for best occupancy + */ + static int find_optimal_block_size(const void *kernel, + size_t dynamic_smem_size = 0) { + int min_grid_size, optimal_block_size; + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &optimal_block_size, + kernel, dynamic_smem_size, 0); + return optimal_block_size; + } +}; + +// ============================================================================ +// Performance Metrics Collector +// ============================================================================ + +class PerformanceMetrics { +public: + struct Metrics { + float kernel_time_ms = 0.0f; + float memory_throughput_gbps = 0.0f; + float compute_throughput_gflops = 0.0f; + float occupancy = 0.0f; + size_t memory_transferred = 0; + }; + + /** + * Measure kernel performance + */ + static Metrics measure_kernel(const std::string &name, + std::function kernel_launch, + size_t memory_transferred = 0, + size_t flops = 0) { + Metrics m; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Warm-up + kernel_launch(); + cudaDeviceSynchronize(); + + // Measure + cudaEventRecord(start); + kernel_launch(); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&m.kernel_time_ms, start, stop); + + // Calculate throughput + if (memory_transferred > 0 && m.kernel_time_ms > 0) { + float seconds = m.kernel_time_ms / 1000.0f; + m.memory_throughput_gbps = (memory_transferred / 1e9) / seconds; + } + + if (flops > 0 && m.kernel_time_ms > 0) { + float seconds = m.kernel_time_ms / 1000.0f; + m.compute_throughput_gflops = (flops / 1e9) / seconds; + } + + m.memory_transferred = memory_transferred; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Store metrics + metrics_[name] = m; + + return m; + } + + /** + * Print performance report + */ + static void 
print_report() { + std::cout << "\n[CUDA Performance Report]" << std::endl; + std::cout << "================================================" << std::endl; + + for (const auto &[name, m] : metrics_) { + std::cout << name << ":" << std::endl; + std::cout << " Time: " << m.kernel_time_ms << " ms" << std::endl; + if (m.memory_throughput_gbps > 0) { + std::cout << " Memory Throughput: " << m.memory_throughput_gbps << " GB/s" << std::endl; + } + if (m.compute_throughput_gflops > 0) { + std::cout << " Compute Throughput: " << m.compute_throughput_gflops << " GFLOPS" << std::endl; + } + if (m.occupancy > 0) { + std::cout << " Occupancy: " << (m.occupancy * 100.0f) << "%" << std::endl; + } + std::cout << std::endl; + } + } + + static void reset() { + metrics_.clear(); + } + +private: + static std::map metrics_; +}; + +// ============================================================================ +// CPU Timer (for comparison) +// ============================================================================ + +class CPUTimer { +public: + CPUTimer(const std::string &name) + : name_(name), start_(std::chrono::high_resolution_clock::now()) {} + + ~CPUTimer() { + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start_); + + std::cout << "[CPU Timer] " << name_ << ": " + << (duration.count() / 1000.0) << " ms" << std::endl; + } + +private: + std::string name_; + std::chrono::high_resolution_clock::time_point start_; +}; + +// ============================================================================ +// Bandwidth Tester +// ============================================================================ + +class BandwidthTester { +public: + /** + * Measure host to device bandwidth + */ + static float measure_h2d_bandwidth(size_t size) { + void *h_data, *d_data; + cudaMallocHost(&h_data, size); + cudaMalloc(&d_data, size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms; + cudaEventElapsedTime(&ms, start, stop); + + float bandwidth_gbps = (size / 1e9) / (ms / 1000.0f); + + cudaFreeHost(h_data); + cudaFree(d_data); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return bandwidth_gbps; + } + + /** + * Measure device to host bandwidth + */ + static float measure_d2h_bandwidth(size_t size) { + void *h_data, *d_data; + cudaMallocHost(&h_data, size); + cudaMalloc(&d_data, size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms; + cudaEventElapsedTime(&ms, start, stop); + + float bandwidth_gbps = (size / 1e9) / (ms / 1000.0f); + + cudaFreeHost(h_data); + cudaFree(d_data); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return bandwidth_gbps; + } + + /** + * Print bandwidth test results + */ + static void print_bandwidth_tests() { + std::cout << "\n[CUDA Bandwidth Tests]" << std::endl; + std::cout << "================================" << std::endl; + + std::vector sizes = { + 1 * 1024 * 1024, // 1 MB + 16 * 1024 * 1024, // 16 MB + 64 * 1024 * 1024, // 64 MB + 256 * 1024 * 1024 // 256 MB + }; + + for (size_t size : sizes) { + float h2d = measure_h2d_bandwidth(size); + float d2h = measure_d2h_bandwidth(size); + + std::cout << "Size: " << (size / (1024 * 1024)) << " MB" << 
std::endl; + std::cout << " H2D: " << h2d << " GB/s" << std::endl; + std::cout << " D2H: " << d2h << " GB/s" << std::endl; + } + } +}; + +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +// Initialize static members +namespace MetalFish { +namespace GPU { +namespace CUDA { +std::map> KernelTimer::timings_; +std::mutex KernelTimer::timings_mutex_; +std::map PerformanceMetrics::metrics_; +} // namespace CUDA +} // namespace GPU +} // namespace MetalFish + +#endif // CUDA_PROFILING_H diff --git a/src/gpu/cuda/kernels/nnue_persistent.cu b/src/gpu/cuda/kernels/nnue_persistent.cu new file mode 100644 index 00000000..a0592568 --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_persistent.cu @@ -0,0 +1,203 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Persistent Kernels for Small Batches + + Implements persistent kernels that stay resident on the GPU, + reducing launch overhead for small batch evaluations. +*/ + +#ifndef NNUE_PERSISTENT_KERNELS_CU +#define NNUE_PERSISTENT_KERNELS_CU + +#ifdef USE_CUDA + +#include +#include +#include + +namespace cg = cooperative_groups; + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +constexpr int FC0_OUT = 15; +constexpr int FC1_OUT = 32; +constexpr int WEIGHT_SCALE_BITS = 6; +constexpr int OUTPUT_SCALE = 16; + +// ============================================================================ +// Work Queue for Persistent Kernels +// ============================================================================ + +/** + * Work item for NNUE evaluation + */ +struct NNUEWorkItem { + const accumulator_t *accumulators; + int32_t *output; + int hidden_dim; + bool valid; +}; + +/** + * Persistent kernel for small batch NNUE evaluation + * Stays resident and processes work items as they arrive + */ +__global__ void persistent_nnue_evaluator( + const layer_weight_t *fc0_weights, + const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, + const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, + const int32_t *fc2_biases, + NNUEWorkItem *work_queue, + volatile int *queue_head, + volatile int *queue_tail, + int max_queue_size, + volatile bool *shutdown_flag) { + + __shared__ int8_t fc0_sqr[2 * 16]; + __shared__ int8_t fc0_linear[2]; + __shared__ int8_t fc1_out[32]; + + auto grid = cg::this_grid(); + int work_idx = blockIdx.x; + + while (true) { + // Check for shutdown + if (*shutdown_flag) { + break; + } + + // Try to get work + if (*queue_tail <= *queue_head) { + // No work available, wait briefly + // Use __nanosleep on SM 7.0+, busy-wait on older GPUs +#if __CUDA_ARCH__ >= 700 + __nanosleep(1000); // Sleep 1 microsecond +#else + // Busy-wait for compatibility with older GPUs + for (int i = 0; i < 100; i++) { + __threadfence(); + } +#endif + continue; + } + + // Get work item atomically + int item_idx = atomicAdd(const_cast(queue_head), 1); + if (item_idx >= *queue_tail) { + // Missed it, try again + continue; + } + + item_idx = item_idx % max_queue_size; + NNUEWorkItem work = work_queue[item_idx]; + + if (!work.valid) { + continue; + } + + // Process the work item + const accumulator_t *white_acc = work.accumulators; + const accumulator_t *black_acc = white_acc + work.hidden_dim; + + // FC0 layer - simplified version for persistent kernel + int tid = threadIdx.x; + + // Process each perspective + for (int p = 0; p < 2; p++) { + const accumulator_t *acc = (p == 0) ? 
white_acc : black_acc;
+
+      for (int out = tid; out <= FC0_OUT; out += blockDim.x) {
+        int32_t sum = fc0_biases[out];
+
+        for (int i = 0; i < work.hidden_dim; i++) {
+          int16_t val = static_cast<int16_t>(acc[i] >> WEIGHT_SCALE_BITS);
+          int8_t clipped = static_cast<int8_t>(max(0, min(127, static_cast<int>(val))));
+          sum += clipped * fc0_weights[i * (FC0_OUT + 1) + out];
+        }
+
+        int16_t result = static_cast<int16_t>(sum >> WEIGHT_SCALE_BITS);
+        if (out < FC0_OUT) {
+          int clamped = max(0, min(127, static_cast<int>(result)));
+          fc0_sqr[p * FC0_OUT + out] = static_cast<int8_t>((clamped * clamped) >> 7);
+        } else {
+          fc0_linear[p] = static_cast<int8_t>(max(0, min(127, static_cast<int>(result))));
+        }
+      }
+    }
+    __syncthreads();
+
+    // FC1 layer
+    if (tid < FC1_OUT) {
+      int32_t sum = fc1_biases[tid];
+      for (int i = 0; i < 2 * FC0_OUT; i++) {
+        sum += fc0_sqr[i] * fc1_weights[i * FC1_OUT + tid];
+      }
+      fc1_out[tid] = static_cast<int8_t>(
+          max(0, min(127, static_cast<int>(sum >> WEIGHT_SCALE_BITS))));
+    }
+    __syncthreads();
+
+    // FC2 layer with skip connection
+    if (tid == 0) {
+      int32_t sum = fc2_biases[0];
+      for (int i = 0; i < FC1_OUT; i++) {
+        sum += fc1_out[i] * fc2_weights[i];
+      }
+
+      int32_t skip_val = ((fc0_linear[0] + fc0_linear[1]) * 600 * OUTPUT_SCALE) /
+                         (2 * 127 * (1 << WEIGHT_SCALE_BITS));
+      *work.output = sum + skip_val;
+    }
+
+    // Mark work as complete
+    work_queue[item_idx].valid = false;
+    __syncthreads();
+  }
+}
+
+// ============================================================================
+// Host Interface
+// ============================================================================
+
+extern "C" {
+
+/**
+ * Launch persistent kernel
+ * This kernel stays resident and processes work from a queue
+ */
+void cuda_launch_persistent_evaluator(
+    const layer_weight_t *fc0_weights,
+    const int32_t *fc0_biases,
+    const layer_weight_t *fc1_weights,
+    const int32_t *fc1_biases,
+    const layer_weight_t *fc2_weights,
+    const int32_t *fc2_biases,
+    NNUEWorkItem *work_queue,
+    volatile int *queue_head,
+    volatile int *queue_tail,
+    int max_queue_size,
+    volatile bool *shutdown_flag,
+    cudaStream_t stream) {
+
+  // Launch with moderate block size
+  dim3 block(128);
+  dim3 grid(4); // 4 blocks for better latency hiding
+
+  persistent_nnue_evaluator<<<grid, block, 0, stream>>>(
+      fc0_weights, fc0_biases,
+      fc1_weights, fc1_biases,
+      fc2_weights, fc2_biases,
+      work_queue, queue_head, queue_tail,
+      max_queue_size, shutdown_flag);
+}
+
+} // extern "C"
+
+#endif // USE_CUDA
+#endif // NNUE_PERSISTENT_KERNELS_CU
diff --git a/src/gpu/cuda/kernels/nnue_persistent.h b/src/gpu/cuda/kernels/nnue_persistent.h
new file mode 100644
index 00000000..1e00acf6
--- /dev/null
+++ b/src/gpu/cuda/kernels/nnue_persistent.h
@@ -0,0 +1,51 @@
+/*
+  MetalFish - A GPU-accelerated UCI chess engine
+  Copyright (C) 2025 Nripesh Niketan
+
+  Persistent Kernels Header
+*/
+
+#ifndef NNUE_PERSISTENT_KERNELS_H
+#define NNUE_PERSISTENT_KERNELS_H
+
+#ifdef USE_CUDA
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+using layer_weight_t = int8_t;
+using accumulator_t = int32_t;
+
+/**
+ * Work item for NNUE evaluation
+ */
+struct NNUEWorkItem {
+  const accumulator_t *accumulators;
+  int32_t *output;
+  int hidden_dim;
+  bool valid;
+};
+
+extern "C" {
+
+/**
+ * Launch persistent kernel for small batch processing
+ */
+void cuda_launch_persistent_evaluator(
+    const layer_weight_t *fc0_weights,
+    const int32_t *fc0_biases,
+    const layer_weight_t *fc1_weights,
+    const int32_t *fc1_biases,
+    const layer_weight_t *fc2_weights,
+    const int32_t *fc2_biases,
+    NNUEWorkItem *work_queue,
+    volatile int *queue_head,
+    volatile int *queue_tail,
+    int 
max_queue_size, + volatile bool *shutdown_flag, + cudaStream_t stream); + +} // extern "C" + +#endif // USE_CUDA +#endif // NNUE_PERSISTENT_KERNELS_H diff --git a/src/gpu/cuda/kernels/nnue_simd.cu b/src/gpu/cuda/kernels/nnue_simd.cu new file mode 100644 index 00000000..f03f635e --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_simd.cu @@ -0,0 +1,505 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE SIMD Kernels - Warp-Optimized + + Advanced CUDA kernels using warp-level primitives for maximum performance. + Optimized for Volta and later architectures with independent thread scheduling. +*/ + +#ifndef NNUE_CUDA_SIMD_CU +#define NNUE_CUDA_SIMD_CU + +#include +#include +#include + +// Cooperative groups for flexible thread synchronization +#include +namespace cg = cooperative_groups; + +// ============================================================================ +// Architecture Constants +// ============================================================================ + +constexpr int FT_DIM_BIG = 1024; +constexpr int FT_DIM_SMALL = 128; +constexpr int FC0_OUT = 15; +constexpr int FC1_OUT = 32; +constexpr int WEIGHT_SCALE_BITS = 6; +constexpr int OUTPUT_SCALE = 16; +constexpr int HALFKA_DIMS = 45056; + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +// ============================================================================ +// Warp-Level Reduction Primitives +// ============================================================================ + +/** + * Warp-level sum reduction using shuffle operations + * Much faster than shared memory reduction + */ +template +__device__ __forceinline__ T warp_reduce_sum(T val) { +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +/** + * Block-level sum reduction combining warp reductions + */ +template +__device__ __forceinline__ T block_reduce_sum(T val) { + static __shared__ T shared[32]; // One element per warp + + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + // Reduce within warp + val = warp_reduce_sum(val); + + // Write reduced value to shared memory + if (lane == 0) { + shared[wid] = val; + } + __syncthreads(); + + // First warp reduces across warps + if (wid == 0) { + val = (lane < blockDim.x / 32) ? 
shared[lane] : 0; + val = warp_reduce_sum(val); + } + + return val; +} + +/** + * Warp-level max reduction using shuffle operations + */ +template +__device__ __forceinline__ T warp_reduce_max(T val) { +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val = max(val, __shfl_down_sync(0xffffffff, val, offset)); + } + return val; +} + +// ============================================================================ +// Activation Functions +// ============================================================================ + +__device__ __forceinline__ int8_t clipped_relu(int16_t x) { + return static_cast(max(0, min(127, static_cast(x)))); +} + +__device__ __forceinline__ int8_t sqr_clipped_relu(int16_t x) { + int clamped = max(0, min(127, static_cast(x))); + return static_cast((clamped * clamped) >> 7); +} + +// ============================================================================ +// Feature Extraction with Ballot Sync +// ============================================================================ + +/** + * Extract HalfKA features using warp ballot for efficient bitboard processing + * Uses __ballot_sync to find active lanes with pieces + */ +__global__ void extract_halfka_features_simd( + const uint64_t *__restrict__ piece_bitboards, + const uint8_t *__restrict__ king_squares, + int32_t *__restrict__ white_features, + int32_t *__restrict__ black_features, + uint32_t *__restrict__ feature_counts, + int batch_size, int max_features) { + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + int lane = threadIdx.x % 32; + int warp_id = threadIdx.x / 32; + + __shared__ int white_count_shared; + __shared__ int black_count_shared; + + if (threadIdx.x == 0) { + white_count_shared = 0; + black_count_shared = 0; + } + __syncthreads(); + + int white_ksq = king_squares[pos_idx * 2]; + int black_ksq = king_squares[pos_idx * 2 + 1]; + + // Each warp processes a subset of piece types + int color = warp_id / 3; + int pt = (warp_id % 3) * 2 + 1; + + if (color < 2 && pt <= 6) { + uint64_t bb = piece_bitboards[pos_idx * 14 + color * 7 + pt]; + + // Each lane processes potential squares + int sq_base = lane * 2; + for (int sq_off = 0; sq_off < 2; sq_off++) { + int sq = sq_base + sq_off; + if (sq < 64 && (bb & (1ULL << sq))) { + // White perspective + int oriented_ksq_w = white_ksq ^ ((white_ksq & 4) ? 7 : 0); + int oriented_sq_w = sq ^ ((white_ksq & 4) ? 7 : 0); + int piece_idx_w = (pt - 1) + (color != 0 ? 6 : 0); + int white_feat = oriented_ksq_w * 640 + piece_idx_w * 64 + oriented_sq_w; + + if (white_feat >= 0 && white_feat < HALFKA_DIMS) { + int idx = atomicAdd(&white_count_shared, 1); + if (idx < max_features) { + white_features[pos_idx * max_features + idx] = white_feat; + } + } + + // Black perspective + int black_ksq_mir = black_ksq ^ 56; + int oriented_ksq_b = black_ksq_mir ^ ((black_ksq_mir & 4) ? 7 : 0); + int sq_mir = sq ^ 56; + int oriented_sq_b = sq_mir ^ ((black_ksq_mir & 4) ? 7 : 0); + int piece_idx_b = (pt - 1) + ((color ^ 1) != 0 ? 
6 : 0); + int black_feat = oriented_ksq_b * 640 + piece_idx_b * 64 + oriented_sq_b; + + if (black_feat >= 0 && black_feat < HALFKA_DIMS) { + int idx = atomicAdd(&black_count_shared, 1); + if (idx < max_features) { + black_features[pos_idx * max_features + idx] = black_feat; + } + } + } + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + feature_counts[pos_idx * 2] = white_count_shared; + feature_counts[pos_idx * 2 + 1] = black_count_shared; + } +} + +// ============================================================================ +// Feature Transform with Warp Shuffle +// ============================================================================ + +/** + * Feature transform using advanced warp shuffle for feature broadcast + * Achieves better memory coalescing than standard approach + */ +__global__ void feature_transform_simd( + const weight_t *__restrict__ weights, + const weight_t *__restrict__ biases, + const int32_t *__restrict__ features, + const uint32_t *__restrict__ feature_counts, + accumulator_t *__restrict__ accumulators, + int hidden_dim, int batch_size, int max_features_per_pos) { + + int pos_idx = blockIdx.y; + if (pos_idx >= batch_size) return; + + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition<32>(block); + + int warp_id = threadIdx.x / 32; + int lane = threadIdx.x % 32; + + // Each warp processes 32 hidden dimensions + int hidden_base = (blockIdx.x * (blockDim.x / 32) + warp_id) * 32; + int hidden_idx = hidden_base + lane; + + if (hidden_idx >= hidden_dim) return; + + // Start with bias + accumulator_t acc = static_cast(biases[hidden_idx]); + + // Feature counts are stored as [white, black] for each position + // For now, we process white features (index 0). This should be extended + // to handle both perspectives or the caller should specify which perspective. + int count = feature_counts[pos_idx * 2]; // Use white features + const int32_t *pos_features = features + pos_idx * max_features_per_pos; + + // Process features with warp-level cooperation + // Note: Broadcasting one feature at a time provides good coalesced access + // to weights. Alternative approaches (shared memory or processing multiple + // features) trade off register pressure and may not improve performance. + // This simple approach keeps registers low and allows high occupancy. + for (int i = 0; i < count; i++) { + // Lane 0 reads the feature index + int32_t feat_idx = (lane == 0) ? 
pos_features[i] : 0; + + // Broadcast to all lanes in warp using shuffle + feat_idx = warp.shfl(feat_idx, 0); + + if (feat_idx >= 0 && feat_idx < HALFKA_DIMS) { + // All lanes read coalesced weight access + // Each thread reads weights[feat_idx * hidden_dim + hidden_idx] + // where hidden_idx is unique per thread (hidden_base + lane) + // This ensures perfect coalescing across the warp + acc += weights[feat_idx * hidden_dim + hidden_idx]; + } + } + + accumulators[pos_idx * hidden_dim + hidden_idx] = acc; +} + +// ============================================================================ +// FC Layer with Warp Reduction +// ============================================================================ + +/** + * Fully connected layer using warp-level sum reduction + * Much faster than atomic operations or shared memory + */ +__global__ void fc_layer_simd( + const int8_t *__restrict__ input, + const layer_weight_t *__restrict__ weights, + const int32_t *__restrict__ biases, + int8_t *__restrict__ output, + int input_size, int output_size, int batch_size) { + + int pos_idx = blockIdx.x; + int out_idx = blockIdx.y; + + if (pos_idx >= batch_size || out_idx >= output_size) return; + + const int8_t *in_ptr = input + pos_idx * input_size; + const layer_weight_t *w_ptr = weights + out_idx * input_size; + + // Each thread processes a subset of inputs + int32_t partial_sum = 0; + for (int i = threadIdx.x; i < input_size; i += blockDim.x) { + partial_sum += static_cast(in_ptr[i]) * w_ptr[i]; + } + + // Warp-level reduction + partial_sum = warp_reduce_sum(partial_sum); + + // First thread in each warp writes to shared memory + __shared__ int32_t warp_sums[32]; + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + if (lane == 0) { + warp_sums[wid] = partial_sum; + } + __syncthreads(); + + // Final reduction by first warp + if (wid == 0) { + partial_sum = (lane < blockDim.x / 32) ? 
warp_sums[lane] : 0; + partial_sum = warp_reduce_sum(partial_sum); + + if (lane == 0) { + partial_sum += biases[out_idx]; + output[pos_idx * output_size + out_idx] = + clipped_relu(static_cast(partial_sum >> WEIGHT_SCALE_BITS)); + } + } +} + +// ============================================================================ +// Batch Evaluation with Cooperative Groups +// ============================================================================ + +/** + * Complete NNUE evaluation using cooperative groups + * Enables better thread cooperation and grid-wide synchronization + */ +__global__ void batch_evaluate_simd( + const accumulator_t *__restrict__ accumulators, + const layer_weight_t *__restrict__ fc0_weights, + const int32_t *__restrict__ fc0_biases, + const layer_weight_t *__restrict__ fc1_weights, + const int32_t *__restrict__ fc1_biases, + const layer_weight_t *__restrict__ fc2_weights, + const int32_t *__restrict__ fc2_biases, + int32_t *__restrict__ output, + int hidden_dim, int batch_size) { + + auto grid = cg::this_grid(); + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition<32>(block); + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + __shared__ int8_t fc0_sqr[2 * 16]; + __shared__ int8_t fc0_linear[2]; + __shared__ int8_t fc1_out[32]; + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + int lane = threadIdx.x % 32; + int wid = threadIdx.x / 32; + + // FC0 layer - process both perspectives in parallel with warp-level cooperation + for (int p = 0; p < 2; p++) { + const accumulator_t *acc = (p == 0) ? white_acc : black_acc; + + // Each warp cooperatively computes all FC0 outputs + for (int out = 0; out <= FC0_OUT; ++out) { + // Lane 0 starts from bias; other lanes start from 0 to avoid double-counting + int32_t sum = (lane == 0) ? 
fc0_biases[out] : 0; + + // Warp-level reduction over hidden dims: strided accumulation per lane + for (int i = lane; i < hidden_dim; i += 32) { + int8_t clipped = clipped_relu( + static_cast(acc[i] >> WEIGHT_SCALE_BITS)); + sum += clipped * fc0_weights[i * (FC0_OUT + 1) + out]; + } + + // Reduce partial sums across the warp + sum = warp_reduce_sum(sum); + + if (lane == 0) { + int16_t result = static_cast(sum >> WEIGHT_SCALE_BITS); + if (out < FC0_OUT) { + fc0_sqr[p * FC0_OUT + out] = sqr_clipped_relu(result); + } else { + fc0_linear[p] = clipped_relu(result); + } + } + } + } + block.sync(); + + // FC1 layer + if (lane < FC1_OUT) { + int32_t sum = fc1_biases[lane]; + for (int i = 0; i < 2 * FC0_OUT; i++) { + sum += fc0_sqr[i] * fc1_weights[i * FC1_OUT + lane]; + } + fc1_out[lane] = clipped_relu(static_cast(sum >> WEIGHT_SCALE_BITS)); + } + block.sync(); + + // FC2 layer with skip connection + if (threadIdx.x == 0) { + int32_t sum = fc2_biases[0]; + for (int i = 0; i < FC1_OUT; i++) { + sum += fc1_out[i] * fc2_weights[i]; + } + + // Add skip connection + int32_t skip_val = ((fc0_linear[0] + fc0_linear[1]) * 600 * OUTPUT_SCALE) / + (2 * 127 * (1 << WEIGHT_SCALE_BITS)); + output[pos_idx] = sum + skip_val; + } +} + +// ============================================================================ +// PSQT Accumulation with Warp Reduction +// ============================================================================ + +/** + * PSQT (Piece-Square Table) accumulation using warp primitives + */ +__global__ void psqt_accumulate_simd( + const int32_t *__restrict__ features, + const uint32_t *__restrict__ feature_counts, + const int32_t *__restrict__ psqt_weights, + int32_t *__restrict__ psqt_values, + int batch_size, int max_features, int num_buckets) { + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + auto warp = cg::tiled_partition<32>(cg::this_thread_block()); + int lane = warp.thread_rank(); + + int count = feature_counts[pos_idx]; + const int32_t *pos_features = features + pos_idx * max_features; + + // Each thread accumulates a subset of features + int32_t partial_sum = 0; + for (int i = lane; i < count; i += 32) { + int feat_idx = pos_features[i]; + if (feat_idx >= 0) { + partial_sum += psqt_weights[feat_idx]; + } + } + + // Warp-level sum reduction + partial_sum = warp_reduce_sum(partial_sum); + + // Lane 0 writes the result + if (lane == 0) { + psqt_values[pos_idx] = partial_sum; + } +} + +// ============================================================================ +// Host Interface Functions +// ============================================================================ + +extern "C" { + +void cuda_feature_transform_simd( + const weight_t *weights, const weight_t *biases, + const int32_t *features, const uint32_t *feature_counts, + accumulator_t *accumulators, int hidden_dim, int batch_size, + int max_features_per_pos, cudaStream_t stream) { + + dim3 block(256); // 8 warps per block + dim3 grid((hidden_dim + 255) / 256, batch_size); + + feature_transform_simd<<>>( + weights, biases, features, feature_counts, accumulators, + hidden_dim, batch_size, max_features_per_pos); +} + +void cuda_fc_layer_simd( + const int8_t *input, const layer_weight_t *weights, + const int32_t *biases, int8_t *output, + int input_size, int output_size, int batch_size, cudaStream_t stream) { + + dim3 block(128); // 4 warps per block + dim3 grid(batch_size, output_size); + + fc_layer_simd<<>>( + input, weights, biases, output, input_size, output_size, batch_size); +} + +void 
cuda_batch_evaluate_simd( + const accumulator_t *accumulators, + const layer_weight_t *fc0_weights, const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, const int32_t *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream) { + + dim3 block(128); + dim3 grid(batch_size); + + batch_evaluate_simd<<>>( + accumulators, fc0_weights, fc0_biases, fc1_weights, fc1_biases, + fc2_weights, fc2_biases, output, hidden_dim, batch_size); +} + +void cuda_psqt_accumulate_simd( + const int32_t *features, const uint32_t *feature_counts, + const int32_t *psqt_weights, int32_t *psqt_values, + int batch_size, int max_features, int num_buckets, cudaStream_t stream) { + + dim3 block(32); // Single warp + dim3 grid(batch_size); + + psqt_accumulate_simd<<>>( + features, feature_counts, psqt_weights, psqt_values, + batch_size, max_features, num_buckets); +} + +} // extern "C" + +#endif // NNUE_CUDA_SIMD_CU diff --git a/src/gpu/cuda/kernels/nnue_simd.h b/src/gpu/cuda/kernels/nnue_simd.h new file mode 100644 index 00000000..11ecce4f --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_simd.h @@ -0,0 +1,55 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE SIMD Kernels Header + + Interface for warp-optimized CUDA kernels. +*/ + +#ifndef NNUE_CUDA_SIMD_H +#define NNUE_CUDA_SIMD_H + +#include +#include + +using weight_t = int16_t; +using layer_weight_t = int8_t; +using accumulator_t = int32_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// Feature transform with warp shuffle optimization +void cuda_feature_transform_simd( + const weight_t *weights, const weight_t *biases, + const int32_t *features, const uint32_t *feature_counts, + accumulator_t *accumulators, int hidden_dim, int batch_size, + int max_features_per_pos, cudaStream_t stream); + +// FC layer with warp reduction +void cuda_fc_layer_simd( + const int8_t *input, const layer_weight_t *weights, + const int32_t *biases, int8_t *output, + int input_size, int output_size, int batch_size, cudaStream_t stream); + +// Batch evaluation with cooperative groups +void cuda_batch_evaluate_simd( + const accumulator_t *accumulators, + const layer_weight_t *fc0_weights, const int32_t *fc0_biases, + const layer_weight_t *fc1_weights, const int32_t *fc1_biases, + const layer_weight_t *fc2_weights, const int32_t *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream); + +// PSQT accumulation with warp reduction +void cuda_psqt_accumulate_simd( + const int32_t *features, const uint32_t *feature_counts, + const int32_t *psqt_weights, int32_t *psqt_values, + int batch_size, int max_features, int num_buckets, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif // NNUE_CUDA_SIMD_H diff --git a/src/gpu/cuda/kernels/nnue_tensor_core.cu b/src/gpu/cuda/kernels/nnue_tensor_core.cu new file mode 100644 index 00000000..cb91c6fa --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_tensor_core.cu @@ -0,0 +1,441 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE Tensor Core Kernels + + Tensor core accelerated kernels using WMMA API for maximum performance + on Volta (SM 7.0) and later architectures. 
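+
+  For reference, the kernels below are built around the standard 16x16x16
+  WMMA tile pattern sketched here (illustrative only; A_ptr, B_ptr, C_ptr
+  and the leading dimensions lda/ldb/ldc are placeholders, not part of the
+  engine API):
+
+    fragment<matrix_a, 16, 16, 16, half, row_major> a;
+    fragment<matrix_b, 16, 16, 16, half, col_major> b;
+    fragment<accumulator, 16, 16, 16, half> c;
+    fill_fragment(c, __float2half(0.0f));
+    load_matrix_sync(a, A_ptr, lda);
+    load_matrix_sync(b, B_ptr, ldb);
+    mma_sync(c, a, b, c);
+    store_matrix_sync(C_ptr, c, ldc, mem_row_major);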
+*/
+
+#ifndef NNUE_CUDA_TENSOR_CORE_CU
+#define NNUE_CUDA_TENSOR_CORE_CU
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <iostream>
+
+// Include WMMA (Warp Matrix Multiply-Accumulate) API
+#if __CUDA_ARCH__ >= 700
+#include <mma.h>
+using namespace nvcuda::wmma;
+#endif
+
+// ============================================================================
+// Architecture Constants
+// ============================================================================
+
+constexpr int FT_DIM_BIG = 1024;
+constexpr int FT_DIM_SMALL = 128;
+constexpr int FC0_OUT = 15;
+constexpr int FC1_OUT = 32;
+constexpr int WEIGHT_SCALE_BITS = 6;
+constexpr int OUTPUT_SCALE = 16;
+
+// WMMA tile sizes (16x16x16 for FP16)
+constexpr int WMMA_M = 16;
+constexpr int WMMA_N = 16;
+constexpr int WMMA_K = 16;
+
+using layer_weight_t = int8_t;
+using accumulator_t = int32_t;
+
+// ============================================================================
+// Activation Functions
+// ============================================================================
+
+__device__ __forceinline__ int8_t clipped_relu(int16_t x) {
+  return static_cast<int8_t>(max(0, min(127, static_cast<int>(x))));
+}
+
+__device__ __forceinline__ int8_t sqr_clipped_relu(int16_t x) {
+  int clamped = max(0, min(127, static_cast<int>(x)));
+  return static_cast<int8_t>((clamped * clamped) >> 7);
+}
+
+// ============================================================================
+// FP16 Conversion Helpers
+// ============================================================================
+
+/**
+ * Convert int8 activation to half precision
+ */
+__device__ __forceinline__ half int8_to_half(int8_t x) {
+  return __int2half_rn(static_cast<int>(x));
+}
+
+/**
+ * Convert half precision back to int8 with clipping
+ */
+__device__ __forceinline__ int8_t half_to_int8_clipped(half x) {
+  int val = __half2int_rn(x);
+  return static_cast<int8_t>(max(0, min(127, val)));
+}
+
+// ============================================================================
+// Tensor Core FC Layer (FP16)
+// ============================================================================
+
+#if __CUDA_ARCH__ >= 700
+
+/**
+ * Fully connected layer using tensor cores (WMMA API)
+ * Input: [batch_size, input_size] in FP16
+ * Weights: [output_size, input_size] in FP16
+ * Output: [batch_size, output_size] in FP16
+ *
+ * Uses 16x16x16 tiles for optimal tensor core utilization
+ */
+__global__ void fc_layer_tensor_core_fp16(
+    const half *__restrict__ input,
+    const half *__restrict__ weights,
+    const half *__restrict__ biases,
+    half *__restrict__ output,
+    int batch_size, int input_size, int output_size) {
+
+  // Warp and lane IDs
+  int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  int warpN = blockIdx.y;
+
+  // Declare the fragments
+  fragment<matrix_a, WMMA_M, WMMA_N, WMMA_K, half, row_major> a_frag;
+  fragment<matrix_b, WMMA_M, WMMA_N, WMMA_K, half, col_major> b_frag;
+  fragment<accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
+
+  // Initialize the output to zero
+  fill_fragment(c_frag, __float2half(0.0f));
+
+  // Bounds check
+  if (warpM * WMMA_M >= batch_size || warpN * WMMA_N >= output_size) {
+    return;
+  }
+
+  // Matrix multiply: C = A * B^T
+  // A: [batch_size, input_size]
+  // B: [output_size, input_size] (transposed to col_major)
+  for (int k = 0; k < input_size; k += WMMA_K) {
+    int aRow = warpM * WMMA_M;
+    int aCol = k;
+    int bRow = k;
+    int bCol = warpN * WMMA_N;
+
+    // Load A fragment (input activations)
+    if (aRow < batch_size && aCol < input_size) {
+      load_matrix_sync(a_frag, input + aRow * input_size + aCol, input_size);
+    }
+
+    // Load B fragment (weights, transposed)
+    if (bCol < output_size && bRow < input_size) {
+      load_matrix_sync(b_frag, weights + bCol * 
input_size + bRow, input_size); + } + + // Perform the matrix multiply-accumulate + mma_sync(c_frag, a_frag, b_frag, c_frag); + } + + // Store the output first (WMMA handles the fragment layout automatically) + int cRow = warpM * WMMA_M; + int cCol = warpN * WMMA_N; + if (cRow < batch_size && cCol < output_size) { + store_matrix_sync(output + cRow * output_size + cCol, c_frag, + output_size, mem_row_major); + } + + // Add biases in global memory to avoid fragment layout assumptions + // Only one thread per warp does this to avoid races + if (biases != nullptr && threadIdx.x % 32 == 0) { + for (int row = 0; row < WMMA_M && (cRow + row) < batch_size; ++row) { + for (int col = 0; col < WMMA_N && (cCol + col) < output_size; ++col) { + int global_col = cCol + col; + int out_index = (cRow + row) * output_size + global_col; + output[out_index] = __hadd(output[out_index], biases[global_col]); + } + } + } +} + +/** + * FC0 layer using tensor cores + * Converts int32 accumulators to FP16, applies tensor cores, converts back + */ +__global__ void fc0_layer_tensor_core( + const accumulator_t *__restrict__ accumulators, + const half *__restrict__ weights_fp16, + const half *__restrict__ biases_fp16, + int8_t *__restrict__ output_sqr, + int8_t *__restrict__ output_linear, + int hidden_dim, int batch_size) { + + extern __shared__ half shared_mem[]; + half *input_fp16 = shared_mem; + half *output_fp16 = shared_mem + blockDim.x * hidden_dim; + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + // Convert both perspectives to FP16 + for (int i = threadIdx.x; i < 2 * hidden_dim; i += blockDim.x) { + const accumulator_t *acc = (i < hidden_dim) ? white_acc : black_acc; + int idx = (i < hidden_dim) ? 
i : i - hidden_dim; + + // Apply clipped ReLU and convert to FP16 + int16_t val = static_cast(acc[idx] >> WEIGHT_SCALE_BITS); + int8_t clipped = clipped_relu(val); + input_fp16[i] = __int2half_rn(clipped); + } + __syncthreads(); + + // Compute dot product between input_fp16 (length 2 * hidden_dim) and + // weights_fp16 row for this output, using warp-level primitives + // Note: This version avoids WMMA misuse and uses simple FP16 operations + int warp_id = threadIdx.x / 32; + int lane = threadIdx.x % 32; + + if (warp_id < (FC0_OUT + 1)) { + int out_idx = warp_id; + + // Each thread in the warp accumulates over a strided subset of features + half local_sum = __float2half(0.0f); + for (int k = lane; k < 2 * hidden_dim; k += warpSize) { + half in_val = input_fp16[k]; + half w_val = weights_fp16[out_idx * 2 * hidden_dim + k]; + local_sum = __hadd(local_sum, __hmul(in_val, w_val)); + } + + // Warp-level reduction to get total sum + for (int offset = 16; offset > 0; offset /= 2) { + local_sum = __hadd(local_sum, __shfl_down_sync(0xffffffff, local_sum, offset)); + } + + // Only lane 0 has the final sum, add bias and store + if (lane == 0) { + local_sum = __hadd(local_sum, biases_fp16[out_idx]); + int16_t result = __half2int_rn(local_sum); + + // Store squared and linear outputs + if (out_idx < FC0_OUT) { + output_sqr[pos_idx * 2 * FC0_OUT + out_idx] = sqr_clipped_relu(result); + output_sqr[pos_idx * 2 * FC0_OUT + FC0_OUT + out_idx] = sqr_clipped_relu(result); + } else { + output_linear[pos_idx * 2] = clipped_relu(result); + output_linear[pos_idx * 2 + 1] = clipped_relu(result); + } + } + } +} + +/** + * Fused NNUE evaluation using tensor cores throughout + */ +__global__ void nnue_forward_tensor_core( + const accumulator_t *__restrict__ accumulators, + const half *__restrict__ fc0_weights, + const half *__restrict__ fc0_biases, + const half *__restrict__ fc1_weights, + const half *__restrict__ fc1_biases, + const half *__restrict__ fc2_weights, + const half *__restrict__ fc2_biases, + int32_t *__restrict__ output, + int hidden_dim, int batch_size) { + + extern __shared__ half shared_mem[]; + + int pos_idx = blockIdx.x; + if (pos_idx >= batch_size) return; + + half *fc0_input = shared_mem; + half *fc0_output = shared_mem + 2 * hidden_dim; + half *fc1_output = fc0_output + 2 * (FC0_OUT + 1); + + const accumulator_t *white_acc = accumulators + pos_idx * 2 * hidden_dim; + const accumulator_t *black_acc = white_acc + hidden_dim; + + // Convert accumulators to FP16 + for (int i = threadIdx.x; i < 2 * hidden_dim; i += blockDim.x) { + const accumulator_t *acc = (i < hidden_dim) ? white_acc : black_acc; + int idx = (i < hidden_dim) ? i : i - hidden_dim; + int16_t val = static_cast(acc[idx] >> WEIGHT_SCALE_BITS); + fc0_input[i] = __int2half_rn(clipped_relu(val)); + } + __syncthreads(); + + // FC0 layer with tensor cores (simplified) + // ... (tensor core matrix multiply) + + // FC1 layer with tensor cores + // ... 
(tensor core matrix multiply) + + // FC2 layer (small, can use standard multiplication) + if (threadIdx.x == 0) { + half sum = fc2_biases[0]; + for (int i = 0; i < FC1_OUT; i++) { + sum = __hfma(fc1_output[i], fc2_weights[i], sum); + } + output[pos_idx] = __half2int_rn(sum); + } +} + +#endif // __CUDA_ARCH__ >= 700 + +// ============================================================================ +// INT8 Tensor Core Support (Turing SM 7.5+) +// ============================================================================ + +#if __CUDA_ARCH__ >= 750 + +/** + * FC layer using INT8 tensor cores (Turing and later) + * Provides even better performance for quantized inference + */ +__global__ void fc_layer_tensor_core_int8( + const int8_t *__restrict__ input, + const int8_t *__restrict__ weights, + const int32_t *__restrict__ biases, + int8_t *__restrict__ output, + int batch_size, int input_size, int output_size) { + + // INT8 tensor cores use 8x8x16 tiles on Turing + // 16x8x16 tiles on Ampere and later + + // Warp and lane IDs + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int warpN = blockIdx.y; + + // Note: INT8 WMMA requires different fragment types + // This is a simplified example - full implementation would use + // appropriate fragment types for INT8 + + // Bounds check + if (warpM * 16 >= batch_size || warpN * 16 >= output_size) { + return; + } + + // INT8 tensor core implementation would go here + // For now, this serves as a placeholder for future optimization +} + +#endif // __CUDA_ARCH__ >= 750 + +// ============================================================================ +// Host Interface Functions +// ============================================================================ + +extern "C" { + +/** + * Check if tensor cores are available on the current device + */ +bool cuda_tensor_cores_available(int device_id) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + // Tensor cores available on SM 7.0 (Volta) and later + return prop.major >= 7; +} + +/** + * Check if INT8 tensor cores are available + */ +bool cuda_int8_tensor_cores_available(int device_id) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + // INT8 tensor cores available on SM 7.5 (Turing) and later + return (prop.major > 7) || (prop.major == 7 && prop.minor >= 5); +} + +// Tensor core function implementations are architecture-specific +// and must be compiled with appropriate -arch flags + +/** + * FC layer with FP16 tensor cores + * Only available when compiled for SM 7.0+ + */ +void cuda_fc_layer_tensor_core_fp16( + const half *input, const half *weights, const half *biases, + half *output, int batch_size, int input_size, int output_size, + cudaStream_t stream) { + + // Runtime check for architecture support + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + dim3 block(128); // 4 warps per block + dim3 grid((batch_size + 15) / 16, // WMMA_M = 16 + (output_size + 15) / 16); // WMMA_N = 16 + + // Launch the kernel - it will be compiled for all architectures in CMAKE_CUDA_ARCHITECTURES + // The kernel code is conditionally compiled based on __CUDA_ARCH__ during device compilation + fc_layer_tensor_core_fp16<<>>( + input, weights, biases, output, batch_size, input_size, output_size); +} + +/** + * FC0 layer with tensor cores + * Only available when compiled for SM 7.0+ + */ +void cuda_fc0_layer_tensor_core( + const 
accumulator_t *accumulators, + const half *weights_fp16, const half *biases_fp16, + int8_t *output_sqr, int8_t *output_linear, + int hidden_dim, int batch_size, cudaStream_t stream) { + + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + dim3 block(128); + dim3 grid(batch_size); + size_t shared_mem = (2 * hidden_dim + 2 * (FC0_OUT + 1)) * sizeof(half); + + // Launch the kernel - it will be compiled for all architectures in CMAKE_CUDA_ARCHITECTURES + fc0_layer_tensor_core<<>>( + accumulators, weights_fp16, biases_fp16, + output_sqr, output_linear, hidden_dim, batch_size); +} + +/** + * Full NNUE forward pass with tensor cores + * Note: This is a simplified implementation. Full implementation would require + * complete tensor core matrix operations for all layers. + * Only available when compiled for SM 7.0+ + */ +void cuda_nnue_forward_tensor_core( + const accumulator_t *accumulators, + const half *fc0_weights, const half *fc0_biases, + const half *fc1_weights, const half *fc1_biases, + const half *fc2_weights, const half *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream) { + + int device; + cudaGetDevice(&device); + if (!cuda_tensor_cores_available(device)) { + std::cerr << "[CUDA] Tensor cores not available on this device" << std::endl; + return; + } + + // TODO: Implement full tensor core forward pass + // Currently this is a placeholder that demonstrates the API + // A complete implementation would: + // 1. Convert accumulators to FP16 + // 2. Use tensor cores for FC0 layer (hidden_dim -> FC0_OUT) + // 3. Use tensor cores for FC1 layer (FC0_OUT -> FC1_OUT) + // 4. Use standard ops for FC2 (small output, not worth tensor cores) + // 5. Apply activations and skip connections + + std::cerr << "[CUDA] Full tensor core forward pass not yet implemented" << std::endl; + std::cerr << "[CUDA] Use individual layer functions instead" << std::endl; +} + +} // extern "C" + +#endif // NNUE_CUDA_TENSOR_CORE_CU diff --git a/src/gpu/cuda/kernels/nnue_tensor_core.h b/src/gpu/cuda/kernels/nnue_tensor_core.h new file mode 100644 index 00000000..197b04fe --- /dev/null +++ b/src/gpu/cuda/kernels/nnue_tensor_core.h @@ -0,0 +1,55 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA NNUE Tensor Core Kernels Header + + Interface for tensor core accelerated kernels. 
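+
+  Typical call pattern (illustrative sketch only; the device pointers
+  d_input, d_weights, d_biases, d_output, the dimensions, and the stream
+  are placeholders assumed to be set up by the caller):
+
+    if (cuda_tensor_cores_available(device_id)) {
+      cuda_fc_layer_tensor_core_fp16(d_input, d_weights, d_biases, d_output,
+                                     batch_size, input_size, output_size,
+                                     stream);
+    }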
+*/ + +#ifndef NNUE_CUDA_TENSOR_CORE_H +#define NNUE_CUDA_TENSOR_CORE_H + +#include +#include +#include + +using accumulator_t = int32_t; +using layer_weight_t = int8_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// Check if tensor cores are available +bool cuda_tensor_cores_available(int device_id); + +// Check if INT8 tensor cores are available +bool cuda_int8_tensor_cores_available(int device_id); + +// FC layer with FP16 tensor cores +void cuda_fc_layer_tensor_core_fp16( + const half *input, const half *weights, const half *biases, + half *output, int batch_size, int input_size, int output_size, + cudaStream_t stream); + +// FC0 layer with tensor cores +void cuda_fc0_layer_tensor_core( + const accumulator_t *accumulators, + const half *weights_fp16, const half *biases_fp16, + int8_t *output_sqr, int8_t *output_linear, + int hidden_dim, int batch_size, cudaStream_t stream); + +// Full NNUE forward pass with tensor cores +void cuda_nnue_forward_tensor_core( + const accumulator_t *accumulators, + const half *fc0_weights, const half *fc0_biases, + const half *fc1_weights, const half *fc1_biases, + const half *fc2_weights, const half *fc2_biases, + int32_t *output, int hidden_dim, int batch_size, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif // NNUE_CUDA_TENSOR_CORE_H diff --git a/tests/test_cuda_advanced.cpp b/tests/test_cuda_advanced.cpp new file mode 100644 index 00000000..ed13b2aa --- /dev/null +++ b/tests/test_cuda_advanced.cpp @@ -0,0 +1,258 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + Advanced CUDA Features Test + + Tests for CUDA graphs, multi-GPU, persistent kernels, and FP16 weights. +*/ + +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include "../src/gpu/cuda/cuda_backend.h" +#include "../src/gpu/cuda/cuda_graphs.h" +#include "../src/gpu/cuda/cuda_multi_gpu.h" +#include "../src/gpu/cuda/cuda_fp16_weights.h" + +using namespace MetalFish::GPU; +using namespace MetalFish::GPU::CUDA; + +// ============================================================================ +// CUDA Graphs Tests +// ============================================================================ + +bool test_cuda_graphs() { + std::cout << "\n[Test] CUDA Graphs" << std::endl; + + GraphManager manager; + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Test graph capture + bool started = manager.begin_capture(stream, "test_graph"); + if (!started) { + std::cerr << " Failed to begin capture" << std::endl; + cudaStreamDestroy(stream); + return false; + } + + // Simulate some operations (empty kernel for test) + void *dummy_buffer; + cudaMalloc(&dummy_buffer, 1024); + cudaMemsetAsync(dummy_buffer, 0, 1024, stream); + + bool ended = manager.end_capture(stream, "test_graph"); + if (!ended) { + std::cerr << " Failed to end capture" << std::endl; + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + return false; + } + + // Test graph replay + bool launched = manager.launch_graph("test_graph", stream); + if (!launched) { + std::cerr << " Failed to launch graph" << std::endl; + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + return false; + } + + cudaStreamSynchronize(stream); + + // Check statistics + auto stats = manager.get_stats(); + std::cout << " Graphs: " << stats.num_graphs + << ", Nodes: " << stats.total_nodes << std::endl; + + cudaFree(dummy_buffer); + cudaStreamDestroy(stream); + + std::cout << " PASSED" << std::endl; + return true; +} + +// 
============================================================================ +// Multi-GPU Tests +// ============================================================================ + +bool test_multi_gpu() { + std::cout << "\n[Test] Multi-GPU Support" << std::endl; + + MultiGPUManager manager; + + // Initialize with all GPUs + if (!manager.initialize(true)) { + std::cout << " SKIPPED (no GPUs available)" << std::endl; + return true; + } + + int num_gpus = manager.get_num_gpus(); + std::cout << " Number of GPUs: " << num_gpus << std::endl; + + // Test GPU enumeration + for (int i = 0; i < num_gpus; i++) { + const auto& info = manager.get_gpu_info(i); + std::cout << " GPU " << i << ": " << info.name + << " (SM " << info.compute_major << "." << info.compute_minor << ")" + << std::endl; + } + + // Test batch distribution + int batch_size = 1024; + auto distribution = manager.distribute_batch(batch_size); + + int total = 0; + for (size_t i = 0; i < distribution.size(); i++) { + std::cout << " GPU " << i << " gets " << distribution[i] << " items" << std::endl; + total += distribution[i]; + } + + if (total != batch_size) { + std::cerr << " Batch distribution mismatch: " << total << " vs " << batch_size << std::endl; + return false; + } + + // Test peer access if multiple GPUs + if (num_gpus > 1) { + manager.enable_peer_access(); + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// FP16 Weights Tests +// ============================================================================ + +bool test_fp16_weights() { + std::cout << "\n[Test] FP16 Weight Storage" << std::endl; + + FP16WeightManager manager; + + // Create test weights + const size_t size = 1024; + std::vector int16_weights(size); + std::vector int32_biases(32); + + for (size_t i = 0; i < size; i++) { + int16_weights[i] = static_cast(i % 128); + } + + for (size_t i = 0; i < 32; i++) { + int32_biases[i] = static_cast(i * 64); + } + + // Convert to FP16 + half* fp16_weights = manager.convert_and_store_weights(int16_weights.data(), size); + if (!fp16_weights) { + std::cerr << " Failed to convert weights" << std::endl; + return false; + } + + half* fp16_biases = manager.convert_and_store_biases(int32_biases.data(), 32); + if (!fp16_biases) { + std::cerr << " Failed to convert biases" << std::endl; + return false; + } + + // Verify conversion by copying back + std::vector verify_weights(size); + cudaMemcpy(verify_weights.data(), fp16_weights, size * sizeof(half), + cudaMemcpyDeviceToHost); + + // Check a few values + for (size_t i = 0; i < 10; i++) { + float expected = static_cast(int16_weights[i]) / 64.0f; + float actual = __half2float(verify_weights[i]); + if (std::abs(expected - actual) > 0.01f) { + std::cerr << " Conversion mismatch at index " << i << std::endl; + return false; + } + } + + size_t mem_usage = manager.get_memory_usage(); + std::cout << " Memory usage: " << (mem_usage / 1024) << " KB" << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Backend Integration Test +// ============================================================================ + +bool test_backend_features() { + std::cout << "\n[Test] Backend Feature Integration" << std::endl; + + auto &backend = CUDABackend::instance(); + + if (!backend.is_available()) { + std::cout << " SKIPPED (no CUDA device)" << std::endl; + return true; + } + + // Test feature enablement + 
backend.enable_cuda_graphs(true); + backend.enable_multi_gpu(false); // Keep single GPU for simplicity + backend.enable_persistent_kernels(false); + backend.enable_fp16_weights(backend.has_tensor_cores()); + + std::cout << " CUDA Graphs: " << (backend.is_cuda_graphs_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " Multi-GPU: " << (backend.is_multi_gpu_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " Persistent Kernels: " << (backend.is_persistent_kernels_enabled() ? "ON" : "OFF") << std::endl; + std::cout << " FP16 Weights: " << (backend.is_fp16_weights_enabled() ? "ON" : "OFF") << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Main Test Runner +// ============================================================================ + +int main() { + std::cout << "======================================" << std::endl; + std::cout << "Advanced CUDA Features Tests" << std::endl; + std::cout << "======================================" << std::endl; + + int passed = 0; + int failed = 0; + + // Run tests + if (test_cuda_graphs()) passed++; else failed++; + if (test_multi_gpu()) passed++; else failed++; + if (test_fp16_weights()) passed++; else failed++; + if (test_backend_features()) passed++; else failed++; + + // Print summary + std::cout << "\n======================================" << std::endl; + std::cout << "Test Summary" << std::endl; + std::cout << "======================================" << std::endl; + std::cout << "Passed: " << passed << std::endl; + std::cout << "Failed: " << failed << std::endl; + std::cout << "Total: " << (passed + failed) << std::endl; + + if (failed == 0) { + std::cout << "\nAll tests PASSED! ✓" << std::endl; + } else { + std::cout << "\nSome tests FAILED! ✗" << std::endl; + } + + return (failed == 0) ? 0 : 1; +} + +#else // !USE_CUDA + +int main() { + std::cout << "CUDA support not enabled. Skipping tests." << std::endl; + return 0; +} + +#endif // USE_CUDA diff --git a/tests/test_cuda_optimizations.cpp b/tests/test_cuda_optimizations.cpp new file mode 100644 index 00000000..ae75943c --- /dev/null +++ b/tests/test_cuda_optimizations.cpp @@ -0,0 +1,361 @@ +/* + MetalFish - A GPU-accelerated UCI chess engine + Copyright (C) 2025 Nripesh Niketan + + CUDA Optimization Tests + + Tests for tensor cores, warp primitives, and memory optimizations. 
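+
+  The warp-primitive kernels exercised here are built on the shuffle-based
+  reduction pattern from nnue_simd.cu, reproduced for reference only:
+
+    for (int offset = 16; offset > 0; offset /= 2) {
+      val += __shfl_down_sync(0xffffffff, val, offset);
+    }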
+*/ + +#include +#include +#include +#include + +#ifdef USE_CUDA + +#include "../src/gpu/cuda/cuda_backend.h" +#include "../src/gpu/cuda/cuda_memory.h" +#include "../src/gpu/cuda/cuda_profiling.h" +#include "../src/gpu/cuda/kernels/nnue_simd.h" + +#ifdef USE_CUDA_TENSOR_CORES +#include "../src/gpu/cuda/kernels/nnue_tensor_core.h" +#endif + +using namespace MetalFish::GPU; + +namespace { + +// Helper function to compare arrays with tolerance +template +bool arrays_equal(const T *a, const T *b, size_t n, float tolerance = 1e-4f) { + for (size_t i = 0; i < n; i++) { + float diff = std::abs(static_cast(a[i]) - static_cast(b[i])); + if (diff > tolerance) { + std::cerr << "Mismatch at index " << i << ": " << a[i] << " vs " << b[i] + << " (diff: " << diff << ")" << std::endl; + return false; + } + } + return true; +} + +} // namespace + +// ============================================================================ +// Memory Management Tests +// ============================================================================ + +bool test_unified_memory() { + std::cout << "\n[Test] Unified Memory with Hints" << std::endl; + + const size_t size = 1024 * 1024; // 1MB + int device_id = 0; + + // Test basic unified memory allocation + void *ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id); + if (!ptr) { + std::cerr << " Failed to allocate unified memory" << std::endl; + return false; + } + + // Test read-only allocation + void *readonly_ptr = CUDA::UnifiedMemoryManager::allocate_unified_readonly(size, device_id); + if (!readonly_ptr) { + std::cerr << " Failed to allocate read-only unified memory" << std::endl; + CUDA::UnifiedMemoryManager::free_unified(ptr); + return false; + } + + // Test prefetching + CUDA::UnifiedMemoryManager::prefetch_to_device(ptr, size, device_id); + cudaDeviceSynchronize(); + + CUDA::UnifiedMemoryManager::prefetch_to_host(ptr, size); + cudaDeviceSynchronize(); + + // Cleanup + CUDA::UnifiedMemoryManager::free_unified(ptr); + CUDA::UnifiedMemoryManager::free_unified(readonly_ptr); + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_pinned_memory() { + std::cout << "\n[Test] Pinned Memory" << std::endl; + + const size_t size = 1024 * 1024; // 1MB + + // Test pinned allocation + void *ptr = CUDA::PinnedMemoryManager::allocate_pinned(size); + if (!ptr) { + std::cerr << " Failed to allocate pinned memory" << std::endl; + return false; + } + + // Test memory registration + std::vector host_mem(size); + if (!CUDA::PinnedMemoryManager::register_pinned(host_mem.data(), size)) { + std::cerr << " Failed to register pinned memory" << std::endl; + CUDA::PinnedMemoryManager::free_pinned(ptr); + return false; + } + + // Cleanup + CUDA::PinnedMemoryManager::unregister_pinned(host_mem.data()); + CUDA::PinnedMemoryManager::free_pinned(ptr); + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_double_buffer() { + std::cout << "\n[Test] Double Buffer" << std::endl; + + const size_t size = 1024; + int device_id = 0; + + CUDA::DoubleBuffer buffer(size, device_id); + + // Check if buffer was successfully initialized + if (!buffer.is_valid()) { + std::cerr << " Failed to initialize double buffer" << std::endl; + return false; + } + + // Fill buffer with test data + int *host_buf = buffer.get_host_buffer(); + if (!host_buf) { + std::cerr << " Failed to get host buffer" << std::endl; + return false; + } + + for (size_t i = 0; i < size; i++) { + host_buf[i] = static_cast(i); + } + + // First, we need to transfer the current buffer to device before 
swapping + cudaMemcpy(buffer.get_device_buffer(), host_buf, size * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + // Now swap - this prepares for the next iteration + buffer.swap_and_transfer(); + buffer.synchronize(); + + // The current device buffer should still have our data since we just copied it + int *device_buf = buffer.get_device_buffer(); + std::vector result(size); + cudaMemcpy(result.data(), device_buf, size * sizeof(int), cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < size; i++) { + if (result[i] != static_cast(i)) { + std::cerr << " Data mismatch at index " << i << std::endl; + return false; + } + } + + std::cout << " PASSED" << std::endl; + return true; +} + +bool test_memory_pool() { + std::cout << "\n[Test] Memory Pool" << std::endl; + + const size_t pool_size = 10 * 1024 * 1024; // 10MB + int device_id = 0; + + CUDA::MemoryPool pool(pool_size, device_id); + + // Test allocations + void *ptr1 = pool.allocate(1024); + void *ptr2 = pool.allocate(2048); + void *ptr3 = pool.allocate(4096); + + if (!ptr1 || !ptr2 || !ptr3) { + std::cerr << " Failed to allocate from pool" << std::endl; + return false; + } + + size_t allocated = pool.get_allocated(); + if (allocated < 7168) { // 1024 + 2048 + 4096 + std::cerr << " Incorrect allocation size: " << allocated << std::endl; + return false; + } + + // Test reset + pool.reset(); + if (pool.get_allocated() != 0) { + std::cerr << " Pool reset failed" << std::endl; + return false; + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Profiling Tests +// ============================================================================ + +bool test_kernel_timer() { + std::cout << "\n[Test] Kernel Timer" << std::endl; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Allocate a small buffer for the test + void *test_buffer; + cudaMalloc(&test_buffer, 1024); + + { + CUDA::KernelTimer timer("test_kernel", stream); + + // Simulate some work with actual operation + cudaMemsetAsync(test_buffer, 0, 1024, stream); + cudaStreamSynchronize(stream); + } + + float avg_time = CUDA::KernelTimer::get_average_time("test_kernel"); + if (avg_time < 0.0f) { + std::cerr << " Invalid timing result" << std::endl; + cudaFree(test_buffer); + cudaStreamDestroy(stream); + return false; + } + + cudaFree(test_buffer); + cudaStreamDestroy(stream); + + std::cout << " PASSED (avg time: " << avg_time << " ms)" << std::endl; + return true; +} + +bool test_bandwidth_measurement() { + std::cout << "\n[Test] Bandwidth Measurement" << std::endl; + + const size_t test_size = 64 * 1024 * 1024; // 64MB + + float h2d_bandwidth = CUDA::BandwidthTester::measure_h2d_bandwidth(test_size); + float d2h_bandwidth = CUDA::BandwidthTester::measure_d2h_bandwidth(test_size); + + std::cout << " H2D Bandwidth: " << h2d_bandwidth << " GB/s" << std::endl; + std::cout << " D2H Bandwidth: " << d2h_bandwidth << " GB/s" << std::endl; + + if (h2d_bandwidth <= 0.0f || d2h_bandwidth <= 0.0f) { + std::cerr << " Invalid bandwidth measurements" << std::endl; + return false; + } + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Tensor Core Tests +// ============================================================================ + +#ifdef USE_CUDA_TENSOR_CORES + +bool test_tensor_core_availability() { + std::cout << "\n[Test] Tensor Core Availability" << std::endl; + + int device_id = 0; + 
bool has_fp16 = cuda_tensor_cores_available(device_id); + bool has_int8 = cuda_int8_tensor_cores_available(device_id); + + std::cout << " FP16 Tensor Cores: " << (has_fp16 ? "Yes" : "No") << std::endl; + std::cout << " INT8 Tensor Cores: " << (has_int8 ? "Yes" : "No") << std::endl; + + // Just check that the function runs without error + std::cout << " PASSED" << std::endl; + return true; +} + +#endif // USE_CUDA_TENSOR_CORES + +// ============================================================================ +// Architecture Detection Tests +// ============================================================================ + +bool test_architecture_detection() { + std::cout << "\n[Test] Architecture Detection" << std::endl; + + auto &backend = CUDABackend::instance(); + + if (!backend.is_available()) { + std::cout << " SKIPPED (no CUDA device)" << std::endl; + return true; + } + + std::cout << " Device: " << backend.device_name() << std::endl; + std::cout << " Compute Capability: " + << backend.compute_capability_major() << "." + << backend.compute_capability_minor() << std::endl; + std::cout << " Multiprocessors: " << backend.multiprocessor_count() << std::endl; + std::cout << " Total Memory: " << (backend.total_memory() / (1024 * 1024)) << " MB" << std::endl; + std::cout << " Has Tensor Cores: " << (backend.has_tensor_cores() ? "Yes" : "No") << std::endl; + std::cout << " Has INT8 Tensor Cores: " << (backend.has_int8_tensor_cores() ? "Yes" : "No") << std::endl; + std::cout << " Has Warp Shuffle: " << (backend.has_warp_shuffle() ? "Yes" : "No") << std::endl; + std::cout << " Has Cooperative Groups: " << (backend.has_cooperative_groups() ? "Yes" : "No") << std::endl; + + std::cout << " PASSED" << std::endl; + return true; +} + +// ============================================================================ +// Main Test Runner +// ============================================================================ + +int main() { + std::cout << "======================================" << std::endl; + std::cout << "CUDA Optimization Tests" << std::endl; + std::cout << "======================================" << std::endl; + + int passed = 0; + int failed = 0; + + // Memory tests + if (test_unified_memory()) passed++; else failed++; + if (test_pinned_memory()) passed++; else failed++; + if (test_double_buffer()) passed++; else failed++; + if (test_memory_pool()) passed++; else failed++; + + // Profiling tests + if (test_kernel_timer()) passed++; else failed++; + if (test_bandwidth_measurement()) passed++; else failed++; + + // Architecture tests + if (test_architecture_detection()) passed++; else failed++; + +#ifdef USE_CUDA_TENSOR_CORES + // Tensor core tests + if (test_tensor_core_availability()) passed++; else failed++; +#endif + + // Print summary + std::cout << "\n======================================" << std::endl; + std::cout << "Test Summary" << std::endl; + std::cout << "======================================" << std::endl; + std::cout << "Passed: " << passed << std::endl; + std::cout << "Failed: " << failed << std::endl; + std::cout << "Total: " << (passed + failed) << std::endl; + + if (failed == 0) { + std::cout << "\nAll tests PASSED! ✓" << std::endl; + } else { + std::cout << "\nSome tests FAILED! ✗" << std::endl; + } + + return (failed == 0) ? 0 : 1; +} + +#else // !USE_CUDA + +int main() { + std::cout << "CUDA support not enabled. Skipping tests." << std::endl; + return 0; +} + +#endif // USE_CUDA