From cb85412a280021c640cd8d7c8bbdced13cadb8d1 Mon Sep 17 00:00:00 2001 From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com> Date: Sat, 16 Aug 2025 16:21:26 +0700 Subject: [PATCH 1/4] Add option to disable MMA on Turing --- CMakeLists.txt | 394 ++++++++++++++-------------------- ggml/src/ggml-cuda/common.cuh | 16 +- 2 files changed, 171 insertions(+), 239 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36a2078e4c9fa..29959ee649393 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,266 +1,192 @@ -cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. -project("llama.cpp" C CXX) -include(CheckIncludeFileCXX) - -#set(CMAKE_WARN_DEPRECATED YES) -set(CMAKE_WARN_UNUSED_CLI YES) - -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() - -message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") - -# Add path to modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") - -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(LLAMA_STANDALONE ON) +cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES + +find_package(CUDAToolkit) + +if (CUDAToolkit_FOUND) + message(STATUS "CUDA Toolkit found") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # native == GPUs available at build time + # 50 == Maxwell, lowest CUDA 12 standard + # 60 == P100, FP16 CUDA intrinsics + # 61 == Pascal, __dp4a instruction (per-byte integer dot product) + # 70 == V100, FP16 tensor cores + # 75 == Turing, int8 tensor cores + # 80 == Ampere, asynchronous data loading, faster tensor core instructions + # 86 == RTX 3000, needs CUDA v11.1 + # 89 == RTX 4000, needs CUDA v11.8 + # + # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run + # XX-real == compile CUDA code as device code for this specific architecture + # no suffix == compile as both PTX and device code + # + # The default behavior for a non-native is to build virtual architectures as needed to cover all features needed + # for best performance and to also build real architectures for the most commonly used GPUs. 
+ if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
+ set(CMAKE_CUDA_ARCHITECTURES "native")
+ elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+ if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
+ set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
+ else()
+ set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
+ endif()
+ else()
+ if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
+ set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
+ else()
+ set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
+ endif()
+ endif()
+ endif()
+ message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

- include(git-vars)
+ enable_language(CUDA)

- # configure project version
- # TODO
-else()
- set(LLAMA_STANDALONE OFF)
-endif()
+ file(GLOB GGML_HEADERS_CUDA "*.cuh")
+ list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

-option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+ file(GLOB GGML_SOURCES_CUDA "*.cu")
+ file(GLOB SRCS "template-instances/fattn-mma*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "template-instances/mmq*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})

-if (EMSCRIPTEN)
- set(BUILD_SHARED_LIBS_DEFAULT OFF)
-
- option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
-else()
- if (MINGW)
- set(BUILD_SHARED_LIBS_DEFAULT OFF)
+ if (GGML_CUDA_FA_ALL_QUANTS)
+ file(GLOB SRCS "template-instances/fattn-vec*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
 else()
- set(BUILD_SHARED_LIBS_DEFAULT ON)
+ file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
 endif()
-endif()

-option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
-
-if (WIN32)
- add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-endif()
-
-if (MSVC)
- add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
- add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
- add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
- add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
-endif()
-
-#
-# option list
-#
-
-# debug
-option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-
-# build
-option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
-
-# sanitizers
-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
-
-# utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
-
-# extra artifacts
-option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
-
-# 3rd party libs
-option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
-
-# 
Required for relocatable CMake package -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) - -if (NOT DEFINED LLAMA_BUILD_NUMBER) - set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -endif() -if (NOT DEFINED LLAMA_BUILD_COMMIT) - set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -endif() -set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) -# override ggml options -set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) -set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) -# change the default for these ggml options -if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE_DEFAULT ON) -endif() + if (GGML_CUDA_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() -if (NOT DEFINED GGML_CUDA_GRAPHS) - set(GGML_CUDA_GRAPHS_DEFAULT ON) -endif() + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() -# transition helpers -function (llama_option_depr TYPE OLD NEW) - if (${OLD}) - message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") - set(${NEW} ON PARENT_SCOPE) + if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) endif() -endfunction() - -llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) -llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) -llama_option_depr(WARNING LLAMA_METAL GGML_METAL) -llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) -llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) -llama_option_depr(WARNING LLAMA_RPC GGML_RPC) -llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) -llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) -llama_option_depr(WARNING LLAMA_CANN GGML_CANN) - -if (NOT MSVC) - if (LLAMA_SANITIZE_THREAD) - message(STATUS "Using -fsanitize=thread") - - add_compile_options(-fsanitize=thread) - link_libraries (-fsanitize=thread) + + if (GGML_CUDA_DISABLE_TURING_MMA) + add_compile_definitions(GGML_CUDA_DISABLE_TURING_MMA) endif() - if (LLAMA_SANITIZE_ADDRESS) - message(STATUS "Using -fsanitize=address") + if (GGML_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) + if (NOT GGML_CUDA_FA) + add_compile_definitions(GGML_CUDA_NO_FA) endif() - if (LLAMA_SANITIZE_UNDEFINED) - message(STATUS "Using -fsanitize=undefined") + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) + endif() - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() -endif() -# -# 3rd-party -# + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas) + else () + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static) + endif() + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas) + endif() -if (LLAMA_USE_SYSTEM_GGML) - message(STATUS "Using system-provided libggml, skipping ggml build") - find_package(ggml REQUIRED) - add_library(ggml ALIAS ggml::ggml) -endif() + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + target_link_libraries(ggml-cuda PRIVATE 
CUDA::cuda_driver)
+ endif()

-if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
- set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
- set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
- add_subdirectory(ggml)
- # ... otherwise assume ggml is added by a parent CMakeLists.txt
-endif()
+ set(CUDA_CXX_FLAGS "")

-if (MINGW)
- # Target Windows 8 for PrefetchVirtualMemory
- add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
+ set(CUDA_FLAGS -use_fast_math -extended-lambda)

-#
-# build the library
-#
+ if (GGML_CUDA_DEBUG)
+ list(APPEND CUDA_FLAGS -lineinfo)
+ endif()

-add_subdirectory(src)
+ if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
+ # Options are:
+ # - none (not recommended)
+ # - speed (nvcc's default)
+ # - balance
+ # - size
+ list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
+ endif()

-#
-# utils, programs, examples and tests
-#
+ if (GGML_FATAL_WARNINGS)
+ list(APPEND CUDA_FLAGS -Werror all-warnings)
+ endif()

-if (NOT LLAMA_BUILD_COMMON)
- message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
- set(LLAMA_CURL OFF)
-endif()
+ if (GGML_ALL_WARNINGS AND NOT MSVC)
+ set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+ if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+ list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+ endif()
+
+ execute_process(
+ COMMAND ${NVCC_CMD} -Xcompiler --version
+ OUTPUT_VARIABLE CUDA_CCFULLVER
+ ERROR_QUIET
+ )
+
+ if (NOT CUDA_CCFULLVER MATCHES clang)
+ set(CUDA_CCID "GNU")
+ execute_process(
+ COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+ OUTPUT_VARIABLE CUDA_CCVER
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ else()
+ if (CUDA_CCFULLVER MATCHES Apple)
+ set(CUDA_CCID "AppleClang")
+ else()
+ set(CUDA_CCID "Clang")
+ endif()
+ string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+ endif()
+
+ message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+ ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
+ list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
+ endif()

-if (LLAMA_BUILD_COMMON)
- add_subdirectory(common)
-endif()
+ if (NOT MSVC)
+ list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+ endif()

-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
- include(CTest)
- add_subdirectory(tests)
-endif()
+ list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument

-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
- add_subdirectory(examples)
- add_subdirectory(pocs)
-endif()
+ if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+ list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+ endif()

-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
- add_subdirectory(tools)
+ target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+else()
+ message(FATAL_ERROR "CUDA Toolkit not found")
 endif()

-#
-# install
-#
-
-include(GNUInstallDirs)
-include(CMakePackageConfigHelpers)
-
-set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
-set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
-set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
-
-set(LLAMA_PUBLIC_HEADERS
- ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
- ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
-
-set_target_properties(llama
- PROPERTIES
- PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
-
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
-
-configure_package_config_file(
- 
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama - PATH_VARS LLAMA_INCLUDE_INSTALL_DIR - LLAMA_LIB_INSTALL_DIR - LLAMA_BIN_INSTALL_DIR ) - -write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - VERSION ${LLAMA_INSTALL_VERSION} - COMPATIBILITY SameMajorVersion) - -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake - ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) - -install( - FILES convert_hf_to_gguf.py - PERMISSIONS - OWNER_READ - OWNER_WRITE - OWNER_EXECUTE - GROUP_READ - GROUP_EXECUTE - WORLD_READ - WORLD_EXECUTE - DESTINATION ${CMAKE_INSTALL_BINDIR}) - -configure_file(cmake/llama.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" - @ONLY) - -install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 2b14b30ac90f3..f726d35d47bf8 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -224,7 +224,7 @@ typedef float2 dfloat2; #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) +#if ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && !defined(GGML_CUDA_DISABLE_TURING_MMA) #define FP16_MMA_AVAILABLE #endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) @@ -236,7 +236,7 @@ typedef float2 dfloat2; #define AMD_MFMA_AVAILABLE #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA) -#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && !defined(GGML_CUDA_DISABLE_TURING_MMA) #define TURING_MMA_AVAILABLE #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING @@ -252,6 +252,12 @@ typedef float2 dfloat2; #define FLASH_ATTN_AVAILABLE #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) +#if defined(GGML_CUDA_DISABLE_TURING_MMA) +#define MMA_DISABLE (GGML_CUDA_CC_IS_NVIDIA(cc) && cc == 750) +#else +#define MMA_DISABLE 0 +#endif + static bool fp16_available(const int cc) { return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; } @@ -271,7 +277,7 @@ static bool fp16_mma_available(const int cc) { #if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN) return false; #else - if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || + if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && !MMA_DISABLE) || GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_MTHREADS(cc)) { return true; @@ -289,7 +295,7 @@ static bool fp16_mma_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. 
static bool fp16_mma_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA && !MMA_DISABLE) || GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } @@ -312,7 +318,7 @@ static bool amd_mfma_available(const int cc) { // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. static bool turing_mma_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING; + return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && !MMA_DISABLE; } static bool ampere_mma_available(const int cc) { From 36201c68e4cb02b926240585b7b17aa7a0c2feb4 Mon Sep 17 00:00:00 2001 From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com> Date: Sat, 16 Aug 2025 16:25:13 +0700 Subject: [PATCH 2/4] Revert wrong CMakeLists --- CMakeLists.txt | 394 ++++++++++++++++++------------ ggml/src/ggml-cuda/CMakeLists.txt | 4 + 2 files changed, 238 insertions(+), 160 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29959ee649393..36a2078e4c9fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,192 +1,266 @@ -cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - -find_package(CUDAToolkit) - -if (CUDAToolkit_FOUND) - message(STATUS "CUDA Toolkit found") - - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # native == GPUs available at build time - # 50 == Maxwell, lowest CUDA 12 standard - # 60 == P100, FP16 CUDA intrinsics - # 61 == Pascal, __dp4a instruction (per-byte integer dot product) - # 70 == V100, FP16 tensor cores - # 75 == Turing, int8 tensor cores - # 80 == Ampere, asynchronous data loading, faster tensor core instructions - # 86 == RTX 3000, needs CUDA v11.1 - # 89 == RTX 4000, needs CUDA v11.8 - # - # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run - # XX-real == compile CUDA code as device code for this specific architecture - # no suffix == compile as both PTX and device code - # - # The default behavior for a non-native is to build virtual architectures as needed to cover all features needed - # for best performance and to also build real architectures for the most commonly used GPUs. - if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") - set(CMAKE_CUDA_ARCHITECTURES "native") - elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") - else() - set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") - endif() - else() - if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") - set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") - else() - set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") - endif() - endif() - endif() - message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. 
+project("llama.cpp" C CXX) +include(CheckIncludeFileCXX) + +#set(CMAKE_WARN_DEPRECATED YES) +set(CMAKE_WARN_UNUSED_CLI YES) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") + +# Add path to modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") - enable_language(CUDA) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - file(GLOB GGML_HEADERS_CUDA "*.cuh") - list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h") +if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(LLAMA_STANDALONE ON) - file(GLOB GGML_SOURCES_CUDA "*.cu") - file(GLOB SRCS "template-instances/fattn-mma*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) + include(git-vars) - if (GGML_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + # configure project version + # TODO +else() + set(LLAMA_STANDALONE OFF) +endif() + +option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF) + +if (EMSCRIPTEN) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + + option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON) +else() + if (MINGW) + set(BUILD_SHARED_LIBS_DEFAULT OFF) else() - file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) + set(BUILD_SHARED_LIBS_DEFAULT ON) endif() +endif() - ggml_add_backend_library(ggml-cuda - ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_CUDA} - ) +option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) - add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() - if (GGML_CUDA_GRAPHS) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) - endif() +if (MSVC) + add_compile_options("$<$:/utf-8>") + add_compile_options("$<$:/utf-8>") + add_compile_options("$<$:/bigobj>") + add_compile_options("$<$:/bigobj>") +endif() - if (GGML_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() +# +# option list +# - if (GGML_CUDA_FORCE_CUBLAS) - add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) - endif() +# debug +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) +option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) - if (GGML_CUDA_DISABLE_TURING_MMA) - add_compile_definitions(GGML_CUDA_DISABLE_TURING_MMA) - endif() +# build +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) - if (GGML_CUDA_NO_VMM) - add_compile_definitions(GGML_CUDA_NO_VMM) - endif() +# sanitizers +option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) +option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) +option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) - if (NOT GGML_CUDA_FA) - add_compile_definitions(GGML_CUDA_NO_FA) - endif() +# utils +option(LLAMA_BUILD_COMMON "llama: 
build common utils library" ${LLAMA_STANDALONE}) - if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_F16) - endif() +# extra artifacts +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) - if (GGML_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() +# 3rd party libs +option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) +option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) - if (GGML_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas) - else () - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static) - endif() - else() - target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas) - endif() +# Required for relocatable CMake package +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) - if (GGML_CUDA_NO_VMM) - # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) - else() - target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver) - endif() +if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +endif() +if (NOT DEFINED LLAMA_BUILD_COMMIT) + set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +endif() +set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) - set(CUDA_CXX_FLAGS "") +# override ggml options +set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) - set(CUDA_FLAGS -use_fast_math -extended-lambda) +# change the default for these ggml options +if (NOT DEFINED GGML_LLAMAFILE) + set(GGML_LLAMAFILE_DEFAULT ON) +endif() - if (GGML_CUDA_DEBUG) - list(APPEND CUDA_FLAGS -lineinfo) - endif() +if (NOT DEFINED GGML_CUDA_GRAPHS) + set(GGML_CUDA_GRAPHS_DEFAULT ON) +endif() - if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") - # Options are: - # - none (not recommended) - # - speed (nvcc's default) - # - balance - # - size - list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE}) +# transition helpers +function (llama_option_depr TYPE OLD NEW) + if (${OLD}) + message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") + set(${NEW} ON PARENT_SCOPE) endif() - - if (GGML_FATAL_WARNINGS) - list(APPEND CUDA_FLAGS -Werror all-warnings) +endfunction() + +llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) +llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) +llama_option_depr(WARNING LLAMA_METAL GGML_METAL) +llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) +llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) +llama_option_depr(WARNING LLAMA_RPC GGML_RPC) +llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) +llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_CANN GGML_CANN) + +if (NOT MSVC) + if (LLAMA_SANITIZE_THREAD) + message(STATUS "Using -fsanitize=thread") + + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) endif() - if (GGML_ALL_WARNINGS AND NOT MSVC) - set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) - if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - list(APPEND NVCC_CMD -ccbin 
${CMAKE_CUDA_HOST_COMPILER})
- endif()
-
- execute_process(
- COMMAND ${NVCC_CMD} -Xcompiler --version
- OUTPUT_VARIABLE CUDA_CCFULLVER
- ERROR_QUIET
- )
-
- if (NOT CUDA_CCFULLVER MATCHES clang)
- set(CUDA_CCID "GNU")
- execute_process(
- COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
- OUTPUT_VARIABLE CUDA_CCVER
- ERROR_QUIET
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- else()
- if (CUDA_CCFULLVER MATCHES Apple)
- set(CUDA_CCID "AppleClang")
- else()
- set(CUDA_CCID "Clang")
- endif()
- string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
- endif()
-
- message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
- ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
- list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
- endif()
+ if (LLAMA_SANITIZE_ADDRESS)
+ message(STATUS "Using -fsanitize=address")

- if (NOT MSVC)
- list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+ add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+ link_libraries (-fsanitize=address)
 endif()

- list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
+ if (LLAMA_SANITIZE_UNDEFINED)
+ message(STATUS "Using -fsanitize=undefined")

- if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
- list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+ add_compile_options(-fsanitize=undefined)
+ link_libraries (-fsanitize=undefined)
 endif()
+endif()

- target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-else()
- message(FATAL_ERROR "CUDA Toolkit not found")
+#
+# 3rd-party
+#
+
+if (LLAMA_USE_SYSTEM_GGML)
+ message(STATUS "Using system-provided libggml, skipping ggml build")
+ find_package(ggml REQUIRED)
+ add_library(ggml ALIAS ggml::ggml)
+endif()
+
+if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
+ set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
+ set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
+ add_subdirectory(ggml)
+ # ... 
otherwise assume ggml is added by a parent CMakeLists.txt +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() + +# +# build the library +# + +add_subdirectory(src) + +# +# utils, programs, examples and tests +# + +if (NOT LLAMA_BUILD_COMMON) + message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL") + set(LLAMA_CURL OFF) +endif() + +if (LLAMA_BUILD_COMMON) + add_subdirectory(common) +endif() + +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) + include(CTest) + add_subdirectory(tests) +endif() + +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) + add_subdirectory(pocs) +endif() + +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS) + add_subdirectory(tools) +endif() + +# +# install +# + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + +set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") + +set(LLAMA_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) + +set_target_properties(llama + PROPERTIES + PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") + +install(TARGETS llama LIBRARY PUBLIC_HEADER) + +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama + PATH_VARS LLAMA_INCLUDE_INSTALL_DIR + LLAMA_LIB_INSTALL_DIR + LLAMA_BIN_INSTALL_DIR ) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake + VERSION ${LLAMA_INSTALL_VERSION} + COMPATIBILITY SameMajorVersion) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) + +install( + FILES convert_hf_to_gguf.py + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + DESTINATION ${CMAKE_INSTALL_BINDIR}) + +configure_file(cmake/llama.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" + @ONLY) + +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index bce07ac362830..29959ee649393 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -83,6 +83,10 @@ if (CUDAToolkit_FOUND) add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) endif() + if (GGML_CUDA_DISABLE_TURING_MMA) + add_compile_definitions(GGML_CUDA_DISABLE_TURING_MMA) + endif() + if (GGML_CUDA_NO_VMM) add_compile_definitions(GGML_CUDA_NO_VMM) endif() From 6d7ef1503be3c12c1612b8348bff9d987f899260 Mon Sep 17 00:00:00 2001 From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com> Date: Sat, 16 Aug 2025 20:08:04 +0700 Subject: [PATCH 3/4] Recover FP16 performance and improve general performance --- ggml/src/ggml-cuda/common.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index f726d35d47bf8..ada5c0af941a2 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -268,7 +268,7 @@ static bool fast_fp16_available(const int cc) { // To be used for feature selection of 
external libraries, e.g. cuBLAS. static bool fast_fp16_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 && !MMA_DISABLE) || GGML_CUDA_CC_IS_AMD(cc) || (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } From cf492dd23004e3b3f7349e04a83c40ff4321b62c Mon Sep 17 00:00:00 2001 From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:44:54 +0700 Subject: [PATCH 4/4] Fix comments --- ggml/src/ggml-cuda/common.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index ada5c0af941a2..c7995d643c829 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -226,7 +226,7 @@ typedef float2 dfloat2; #if ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && !defined(GGML_CUDA_DISABLE_TURING_MMA) #define FP16_MMA_AVAILABLE -#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) +#endif // ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && !defined(GGML_CUDA_DISABLE_TURING_MMA) #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #define FP16_MMA_AVAILABLE @@ -238,7 +238,7 @@ typedef float2 dfloat2; #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && !defined(GGML_CUDA_DISABLE_TURING_MMA) #define TURING_MMA_AVAILABLE -#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING +#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && !defined(GGML_CUDA_DISABLE_TURING_MMA) #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #define AMPERE_MMA_AVAILABLE
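
Usage note (illustrative, not part of the patch series): GGML_CUDA_DISABLE_TURING_MMA is an
ordinary CMake option, so a Turing (CC 7.5) build with the tensor-core MMA paths compiled
out is selected at configure time. A sketch, assuming the standard llama.cpp CMake workflow
and a hypothetical local "build" directory:

    # configure a CUDA build with the option added by this series enabled
    cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_DISABLE_TURING_MMA=ON
    # compile
    cmake --build build --config Release

With the option ON, the ggml-cuda CMakeLists defines the GGML_CUDA_DISABLE_TURING_MMA
preprocessor symbol; the common.cuh guards above then drop FP16_MMA_AVAILABLE and
TURING_MMA_AVAILABLE at compile time, and the MMA_DISABLE helper evaluates true for
cc == 750 devices in the runtime feature checks.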