48 changes: 46 additions & 2 deletions CMakeLists.txt
@@ -114,6 +114,11 @@ endif()
# CUDA Configuration
# ============================================================================
if(USE_CUDA)
# CUDA optimization options
option(CUDA_TENSOR_CORES "Enable tensor core kernels (Volta SM 7.0+)" ON)
option(CUDA_PROFILING "Enable NVTX profiling markers" OFF)
option(CUDA_WARP_PRIMITIVES "Enable warp-level primitive optimizations" ON)

# Find CUDA toolkit
find_package(CUDAToolkit QUIET)

@@ -164,6 +169,24 @@ if(USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fPIC")

# Enable tensor core support
if(CUDA_TENSOR_CORES)
add_definitions(-DUSE_CUDA_TENSOR_CORES)
message(STATUS " Tensor Cores: ENABLED")
endif()

# Enable NVTX profiling
if(CUDA_PROFILING)
add_definitions(-DUSE_NVTX)
message(STATUS " NVTX Profiling: ENABLED")
endif()

# Enable warp primitives
if(CUDA_WARP_PRIMITIVES)
add_definitions(-DUSE_CUDA_WARP_PRIMITIVES)
message(STATUS " Warp Primitives: ENABLED")
endif()

# Enable separable compilation for device code
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
@@ -221,6 +244,28 @@ set(GPU_SOURCES src/gpu/gpu_nnue_integration.cpp src/gpu/gpu_mcts_backend.cpp

# CUDA source files
set(CUDA_SOURCES "")
if(USE_CUDA AND CUDA_AVAILABLE)
set(CUDA_SOURCES
src/gpu/cuda/cuda_backend.cu
src/gpu/cuda/cuda_memory.cu
src/gpu/cuda/kernels/nnue_kernels.cu)

# Add advanced optimization kernels if enabled
if(CUDA_WARP_PRIMITIVES)
list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_simd.cu)
endif()

if(CUDA_TENSOR_CORES)
list(APPEND CUDA_SOURCES src/gpu/cuda/kernels/nnue_tensor_core.cu)
endif()

# Add advanced features
list(APPEND CUDA_SOURCES
src/gpu/cuda/cuda_graphs.cu
src/gpu/cuda/cuda_multi_gpu.cu
src/gpu/cuda/cuda_fp16_weights.cu
src/gpu/cuda/kernels/nnue_persistent.cu)
endif()

# MCTS source files (hybrid search) Core files needed for all MCTS modes: -
# stockfish_adapter: Interface between Stockfish and MCTS - position_classifier:
@@ -265,10 +310,9 @@ if(USE_METAL AND METAL_CPP_AVAILABLE)
endif()
elseif(USE_CUDA AND CUDA_AVAILABLE)
# CUDA GPU acceleration
set(CUDA_SOURCES src/gpu/cuda/cuda_backend.cu
src/gpu/cuda/kernels/nnue_kernels.cu)
add_definitions(-DUSE_CUDA)
message(STATUS "CUDA GPU acceleration: ENABLED")
message(STATUS " CUDA source files: ${CUDA_SOURCES}")
else()
# CPU fallback backend
set(GPU_SOURCES ${GPU_SOURCES} src/gpu/cpu_backend.cpp)
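The three new options only inject compile definitions and add kernel sources to the build; the definitions are presumably checked in the sources to select a code path at compile time. A minimal, self-contained sketch of that pattern, with placeholder names rather than the engine's actual kernels:

#include <cstdio>

// Placeholder illustration: the macros match the add_definitions() calls above,
// but the returned strings stand in for the real kernel dispatch.
#if defined(USE_CUDA_TENSOR_CORES)
static const char* nnue_path() { return "tensor-core kernels (nnue_tensor_core.cu)"; }
#elif defined(USE_CUDA_WARP_PRIMITIVES)
static const char* nnue_path() { return "warp-primitive kernels (nnue_simd.cu)"; }
#else
static const char* nnue_path() { return "baseline kernels (nnue_kernels.cu)"; }
#endif

int main() {
  std::printf("NNUE inference path: %s\n", nnue_path());
  return 0;
}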
73 changes: 64 additions & 9 deletions src/gpu/cuda/cuda_backend.cu
@@ -15,6 +15,8 @@
#ifdef USE_CUDA

#include "cuda_backend.h"
#include "cuda_memory.h"
#include "cuda_profiling.h"
#include <atomic>
#include <cstring>
#include <cuda_runtime.h>
@@ -88,11 +90,11 @@ CUDABuffer::CUDABuffer(void *device_ptr, void *host_ptr, size_t size,
CUDABuffer::~CUDABuffer() {
if (device_ptr_) {
if (unified_) {
cudaFree(device_ptr_);
CUDA::UnifiedMemoryManager::free_unified(device_ptr_);
} else {
cudaFree(device_ptr_);
if (host_ptr_) {
cudaFreeHost(host_ptr_);
CUDA::PinnedMemoryManager::free_pinned(host_ptr_);
}
}
}
@@ -263,9 +265,11 @@ void CUDACommandEncoder::barrier() { cudaStreamSynchronize(stream_); }
CUDABackend::CUDABackend()
: device_id_(-1), compute_capability_major_(0),
compute_capability_minor_(0), total_memory_(0), multiprocessor_count_(0),
unified_memory_supported_(false), default_stream_(nullptr),
unified_memory_supported_(false), tensor_cores_available_(false),
int8_tensor_cores_available_(false), default_stream_(nullptr),
stream_index_(0), allocated_memory_(0), peak_memory_(0),
initialized_(false) {}
initialized_(false), use_cuda_graphs_(false), use_multi_gpu_(false),
use_persistent_kernels_(false), use_fp16_weights_(false) {}

CUDABackend::~CUDABackend() { cleanup(); }

@@ -333,6 +337,9 @@ bool CUDABackend::initialize() {
cudaStreamCreate(&parallel_streams_[i]);
}

// Detect architecture-specific features
detect_architecture_features();

initialized_ = true;

std::cout << "[CUDA Backend] Initialized: " << device_name_ << std::endl;
@@ -345,10 +352,51 @@
<< std::endl;
std::cout << "[CUDA Backend] Unified Memory: "
<< (unified_memory_supported_ ? "Yes" : "No") << std::endl;
std::cout << "[CUDA Backend] Tensor Cores: "
<< (tensor_cores_available_ ? "Yes" : "No") << std::endl;
if (tensor_cores_available_) {
std::cout << "[CUDA Backend] INT8 Tensor Cores: "
<< (int8_tensor_cores_available_ ? "Yes" : "No") << std::endl;
}

return true;
}

void CUDABackend::detect_architecture_features() {
// Detect tensor core support
// Volta (SM 7.0) and later have FP16 tensor cores
tensor_cores_available_ = compute_capability_major_ >= 7;

// Turing (SM 7.5) and later have INT8 tensor cores
this->int8_tensor_cores_available_ = (compute_capability_major_ > 7) ||
(compute_capability_major_ == 7 &&
compute_capability_minor_ >= 5);

// Print architecture-specific information
std::string arch_name;
if (compute_capability_major_ == 6 && compute_capability_minor_ == 0) {
arch_name = "Pascal (GP100)";
} else if (compute_capability_major_ == 6 && compute_capability_minor_ == 1) {
arch_name = "Pascal (GP10x)";
} else if (compute_capability_major_ == 7 && compute_capability_minor_ == 0) {
arch_name = "Volta";
} else if (compute_capability_major_ == 7 && compute_capability_minor_ == 5) {
arch_name = "Turing";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 0) {
arch_name = "Ampere (A100)";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 6) {
arch_name = "Ampere (GA10x)";
} else if (compute_capability_major_ == 8 && compute_capability_minor_ == 9) {
arch_name = "Ada Lovelace";
} else if (compute_capability_major_ == 9 && compute_capability_minor_ == 0) {
arch_name = "Hopper";
} else {
arch_name = "Unknown";
}

std::cout << "[CUDA Backend] Architecture: " << arch_name << std::endl;
}

void CUDABackend::cleanup() {
if (!initialized_) {
return;
@@ -462,11 +510,17 @@ std::unique_ptr<Buffer> CUDABackend::create_buffer(size_t size, MemoryMode mode,
bool unified = false;

if (mode == MemoryMode::Shared && unified_memory_supported_) {
// Use unified memory
cudaError_t err = cudaMallocManaged(&device_ptr, size);
if (err != cudaSuccess) {
// Use optimized unified memory with hints
device_ptr = CUDA::UnifiedMemoryManager::allocate_unified(size, device_id_);
if (!device_ptr) {
return nullptr;
}

// For read-only buffers (like weights), use read-mostly hint
if (usage == BufferUsage::Static) {
cudaMemAdvise(device_ptr, size, cudaMemAdviseSetReadMostly, device_id_);
}

unified = true;
} else {
// Allocate device and host memory separately
@@ -476,8 +530,9 @@
}

if (mode != MemoryMode::Private) {
err = cudaMallocHost(&host_ptr, size);
if (err != cudaSuccess) {
// Use pinned memory for faster transfers
host_ptr = CUDA::PinnedMemoryManager::allocate_pinned(size);
if (!host_ptr) {
cudaFree(device_ptr);
return nullptr;
}
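The new create_buffer path leans on two allocation patterns: unified (managed) memory with a read-mostly hint for static buffers such as weights, and pinned host memory as a staging area for everything else. The UnifiedMemoryManager and PinnedMemoryManager wrappers are not part of this diff, so the self-contained sketch below uses the raw CUDA runtime calls they presumably wrap:

#include <cuda_runtime.h>
#include <cstdio>

// Sketch (assumptions noted above): allocate a unified buffer, mark it read-mostly
// so the GPU can keep a local read-only copy, then allocate a pinned (page-locked)
// host buffer for explicit host-to-device transfers.
int main() {
  int device = 0;
  cudaGetDevice(&device);

  const size_t bytes = 1 << 20;

  void* unified = nullptr;
  if (cudaMallocManaged(&unified, bytes) != cudaSuccess) return 1;
  // Read-mostly hint: appropriate for weights that are written once and then only read.
  cudaMemAdvise(unified, bytes, cudaMemAdviseSetReadMostly, device);

  void* pinned = nullptr;
  if (cudaMallocHost(&pinned, bytes) != cudaSuccess) {
    cudaFree(unified);
    return 1;
  }

  std::printf("unified=%p pinned=%p\n", unified, pinned);

  cudaFreeHost(pinned);
  cudaFree(unified);
  return 0;
}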
26 changes: 26 additions & 0 deletions src/gpu/cuda/cuda_backend.h
@@ -169,13 +169,31 @@ class CUDABackend : public Backend {
int compute_capability_minor() const { return compute_capability_minor_; }
size_t total_memory() const { return total_memory_; }
int multiprocessor_count() const { return multiprocessor_count_; }
bool has_tensor_cores() const { return tensor_cores_available_; }
bool has_int8_tensor_cores() const { return int8_tensor_cores_available_; }
bool has_warp_shuffle() const { return compute_capability_major_ >= 3; }
bool has_cooperative_groups() const { return compute_capability_major_ >= 6; }

// Advanced feature support
void enable_cuda_graphs(bool enable) { use_cuda_graphs_ = enable; }
bool is_cuda_graphs_enabled() const { return use_cuda_graphs_; }

void enable_multi_gpu(bool enable) { use_multi_gpu_ = enable; }
bool is_multi_gpu_enabled() const { return use_multi_gpu_; }

void enable_persistent_kernels(bool enable) { use_persistent_kernels_ = enable; }
bool is_persistent_kernels_enabled() const { return use_persistent_kernels_; }

void enable_fp16_weights(bool enable) { use_fp16_weights_ = enable; }
bool is_fp16_weights_enabled() const { return use_fp16_weights_; }

private:
CUDABackend();
~CUDABackend();

bool initialize();
void cleanup();
void detect_architecture_features();

int device_id_;
std::string device_name_;
@@ -184,6 +202,14 @@
size_t total_memory_;
int multiprocessor_count_;
bool unified_memory_supported_;
bool tensor_cores_available_;
bool int8_tensor_cores_available_;

// Feature flags
bool use_cuda_graphs_;
bool use_multi_gpu_;
bool use_persistent_kernels_;
bool use_fp16_weights_;

cudaStream_t default_stream_;
std::vector<cudaStream_t> parallel_streams_;
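The new accessors reduce to compute-capability thresholds (tensor cores from SM 7.0, INT8 tensor cores from SM 7.5, warp shuffle from SM 3.0, cooperative groups from SM 6.0). A standalone sketch of the same checks against cudaGetDeviceProperties, useful for verifying a card outside the backend class:

#include <cuda_runtime.h>
#include <cstdio>

// Standalone sketch of the capability thresholds used by the new accessors.
int main() {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;

  bool tensor_cores = prop.major >= 7;
  bool int8_tensor_cores = (prop.major > 7) || (prop.major == 7 && prop.minor >= 5);
  bool warp_shuffle = prop.major >= 3;
  bool cooperative_groups = prop.major >= 6;

  std::printf("%s (SM %d.%d): tensor cores %d, int8 tensor cores %d, "
              "warp shuffle %d, cooperative groups %d\n",
              prop.name, prop.major, prop.minor, tensor_cores,
              int8_tensor_cores, warp_shuffle, cooperative_groups);
  return 0;
}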
125 changes: 125 additions & 0 deletions src/gpu/cuda/cuda_fp16_weights.cu
@@ -0,0 +1,125 @@
/*
MetalFish - A GPU-accelerated UCI chess engine
Copyright (C) 2025 Nripesh Niketan

FP16 Weight Storage Implementation
*/

#ifdef USE_CUDA

#include "cuda_fp16_weights.h"
#include <iostream>
#include <unordered_map>
#include <string>

namespace MetalFish {
namespace GPU {
namespace CUDA {

FP16WeightManager::~FP16WeightManager() {
clear_all();
}

half* FP16WeightManager::convert_and_store_weights(
const int16_t* int16_weights, size_t size, float scale) {

// Allocate host memory for FP16 conversion
std::vector<half> fp16_host(size);

// Convert INT16 to FP16
for (size_t i = 0; i < size; i++) {
float val = static_cast<float>(int16_weights[i]) / scale;
fp16_host[i] = __float2half(val);
}

// Allocate device memory
half* device_ptr = nullptr;
cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half));
if (err != cudaSuccess) {
std::cerr << "[FP16 Weights] Failed to allocate device memory: "
<< cudaGetErrorString(err) << std::endl;
return nullptr;
}

// Copy to device
err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "[FP16 Weights] Failed to copy to device: "
<< cudaGetErrorString(err) << std::endl;
cudaFree(device_ptr);
return nullptr;
}

total_memory_ += size * sizeof(half);
return device_ptr;
}
Review comment: FP16WeightManager never stores allocated pointers in maps (Medium Severity)

The convert_and_store_weights and convert_and_store_biases functions allocate device memory and return the pointer, but never add entries to the weights_ or biases_ maps. This means get_fp16_weights/get_fp16_biases will never find these allocations, and clear_all() (called in the destructor) won't free them, causing memory leaks.

Additional Locations (1)

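One possible shape of a fix (a hypothetical sketch, not the PR's code): register each allocation under a layer-name key so lookups and cleanup can find it. The current signatures take no layer name, and the real map value type lives in cuda_fp16_weights.h, which is not part of this diff, so both the key parameter and the bare-pointer registry below are assumptions:

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical standalone illustration of the fix pattern: convert INT16 weights
// to FP16, upload them, and register the device pointer under a layer name so a
// later lookup can find it and a cleanup pass can cudaFree() it.
half* convert_and_register(std::unordered_map<std::string, half*>& registry,
                           const std::string& layer_name,
                           const int16_t* int16_weights, size_t size, float scale) {
  std::vector<half> fp16_host(size);
  for (size_t i = 0; i < size; i++)
    fp16_host[i] = __float2half(static_cast<float>(int16_weights[i]) / scale);

  half* device_ptr = nullptr;
  if (cudaMalloc(&device_ptr, size * sizeof(half)) != cudaSuccess)
    return nullptr;
  if (cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
                 cudaMemcpyHostToDevice) != cudaSuccess) {
    cudaFree(device_ptr);
    return nullptr;
  }

  registry[layer_name] = device_ptr;  // the missing bookkeeping step
  return device_ptr;
}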


half* FP16WeightManager::convert_and_store_biases(
const int32_t* int32_biases, size_t size, float scale) {

// Allocate host memory for FP16 conversion
std::vector<half> fp16_host(size);

// Convert INT32 to FP16
for (size_t i = 0; i < size; i++) {
float val = static_cast<float>(int32_biases[i]) / scale;
fp16_host[i] = __float2half(val);
}

// Allocate device memory
half* device_ptr = nullptr;
cudaError_t err = cudaMalloc(&device_ptr, size * sizeof(half));
if (err != cudaSuccess) {
std::cerr << "[FP16 Biases] Failed to allocate device memory: "
<< cudaGetErrorString(err) << std::endl;
return nullptr;
}

// Copy to device
err = cudaMemcpy(device_ptr, fp16_host.data(), size * sizeof(half),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "[FP16 Biases] Failed to copy to device: "
<< cudaGetErrorString(err) << std::endl;
cudaFree(device_ptr);
return nullptr;
}

total_memory_ += size * sizeof(half);
return device_ptr;
}

half* FP16WeightManager::get_fp16_weights(const std::string& layer_name) {
auto it = weights_.find(layer_name);
return (it != weights_.end()) ? it->second.device_ptr : nullptr;
}

half* FP16WeightManager::get_fp16_biases(const std::string& layer_name) {
auto it = biases_.find(layer_name);
return (it != biases_.end()) ? it->second.device_ptr : nullptr;
}

void FP16WeightManager::clear_all() {
for (auto& [name, data] : weights_) {
if (data.device_ptr) {
cudaFree(data.device_ptr);
}
}

for (auto& [name, data] : biases_) {
if (data.device_ptr) {
cudaFree(data.device_ptr);
}
}

weights_.clear();
biases_.clear();
total_memory_ = 0;
}

} // namespace CUDA
} // namespace GPU
} // namespace MetalFish

#endif // USE_CUDA