Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ option(GGML_CUDA "ggml: use CUDA"
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
option(GGML_CUDA_NO_TURING_MMA "ggml: disable the use of mma in mmq kernels" OFF)

set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ if (CUDAToolkit_FOUND)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()

if (GGML_CUDA_NO_TURING_MMA)
add_compile_definitions(GGML_CUDA_NO_TURING_MMA)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()

if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@
#include <string>
#include <vector>

#ifdef GGML_CUDA_NO_TURING_MMA
#define CUBLAS_COMPUTE_16F CUBLAS_COMPUTE_16F_PEDANTIC
#endif

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

[[noreturn]]
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-cuda/mmq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

using namespace ggml_cuda_mma;

#ifdef GGML_CUDA_NO_TURING_MMA
#undef TURING_MMA_AVAILABLE
#endif
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
#define MMQ_ITER_K 256
#define MMQ_NWARPS 8
Expand Down