Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,13 @@ if(BUILD_CUDA)
message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")

# Baseline architecture set depends on the toolkit version: CUDA 11.4+
# supports these at a minimum, while CUDA 13.0 drops compute < 7.5.
set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "13.0")
    # CUDA 11.4 – 12.x baseline: Maxwell (5.x) through Ampere (8.7).
    set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
    set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
else()
    # CUDA 13.0 drops offline compilation for Maxwell, Pascal and Volta
    # (compute capability < 7.5), so sm_70 can no longer be targeted.
    # 8.0 is therefore the only "major" architecture in the baseline set;
    # newer majors (9.0+) are appended by the toolkit-version checks below.
    set(CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87)
    set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80)
endif()

# CUDA 11.8 adds support for Ada and Hopper.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
Expand Down
25 changes: 13 additions & 12 deletions csrc/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
#include <cub/warp/warp_reduce.cuh>
#include <cuda_fp16.h>
#include <math_constants.h>
Expand Down Expand Up @@ -416,7 +417,7 @@ __global__ void kQuantizeBlockwise(
for (int j = 0; j < NUM_PER_TH; j++)
local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));

local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cuda::maximum<>{}, valid_items);

if (threadIdx.x == 0) {
smem_absmax_value[0] = 1.0f / local_abs_max;
Expand Down Expand Up @@ -1002,12 +1003,12 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
}

__syncthreads();
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items);
__syncthreads();
local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items);
local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cuda::maximum<>{}, valid_items);
if (unorm != NULL) {
__syncthreads();
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cuda::std::plus<>{}, valid_items);
}

if (threadIdx.x == 0) {
Expand Down Expand Up @@ -1213,13 +1214,13 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
}

__syncthreads();
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cuda::maximum<>{}, valid_items);
if (threadIdx.x == 0) {
atomicMax(&new_max1[0], local_max_s1);
}
if (unorm != NULL) {
__syncthreads();
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cuda::std::plus<>{}, valid_items);
if (threadIdx.x == 0) {
atomicAdd(&unorm[0], local_unorm);
}
Expand Down Expand Up @@ -1524,11 +1525,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise(
}

// reduce: 2.51/1.60 -> 2.67/1.69
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max());
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cuda::maximum<>{});
new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cuda::maximum<>{});

if (OPTIMIZER == ADEMAMIX) {
new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max());
new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cuda::maximum<>{});
}

if (threadIdx.x == 0) {
Expand Down Expand Up @@ -1737,7 +1738,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise(
}

// reduce: 2.51/1.60 -> 2.67/1.69
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cuda::maximum<>{});

if (threadIdx.x == 0)
smem_exchange1[0] = new_local_abs_max1;
Expand Down Expand Up @@ -1843,7 +1844,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
}

// Reduce thread-local absmax across the block.
const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols);
if (threadIdx.x == 0) {
// Save our block's absmax to shared memory for the quantization step.
rowStats[row_id] = smem_row_absmax = row_absmax;
Expand Down Expand Up @@ -1898,7 +1899,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__

// Reduce thread-local absmax across the block.
// TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cuda::maximum<>{}, cols);
if (threadIdx.x == 0) {
// Save our block's absmax to shared memory for the quantization step.
rowStats[row_id] = row_absmax;
Expand Down
23 changes: 22 additions & 1 deletion csrc/pythonInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,27 @@

#if BUILD_CUDA
#include <ops.cuh>
#include <cuda_runtime_api.h>

#if CUDART_VERSION >= 13000
// CUDA 13 changed the signature of cudaMemPrefetchAsync: the destination is
// now a cudaMemLocation struct plus a reserved flags argument instead of a
// bare device ordinal. This shim keeps the legacy (ptr, bytes, device, stream)
// call shape used by cprefetch() below.
static inline cudaError_t bnb_cudaMemPrefetchAsync(const void* ptr,
                                                   size_t bytes,
                                                   int device,
                                                   cudaStream_t stream) {
    cudaMemLocation loc{};
    if (device == cudaCpuDeviceId) {
        // The legacy API accepted cudaCpuDeviceId (-1) to prefetch to host
        // memory; with the struct-based API that must be expressed as a host
        // location, not a device location with a negative ordinal.
        loc.type = cudaMemLocationTypeHost;
        loc.id = 0; // ignored for host locations
    } else {
        loc.type = cudaMemLocationTypeDevice;
        loc.id = device;
    }
    // The flags parameter is reserved and must be 0.
    return cudaMemPrefetchAsync(ptr, bytes, loc, 0u, stream);
}
#else
// Pre-CUDA-13 toolkits: the runtime still exposes the legacy signature that
// takes a device ordinal directly, so just forward the call.
static inline cudaError_t bnb_cudaMemPrefetchAsync(const void* ptr,
                                                   size_t bytes,
                                                   int device,
                                                   cudaStream_t stream) {
    return cudaMemPrefetchAsync(ptr, bytes, device, stream);
}
#endif
#endif
#if BUILD_HIP
#include <ops_hip.cuh>
Expand Down Expand Up @@ -623,7 +644,7 @@ void cprefetch(void* ptr, size_t bytes, int device) {
if (hasPrefetch == 0)
return;

CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
CUDA_CHECK_RETURN(bnb_cudaMemPrefetchAsync(ptr, bytes, device, 0));
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Expand Down
Loading