
Commit 46876d2

cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (ggml-org#3946)

* prototyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build
* doc: add comments to ggml_cublas_loaded()
* fix defined(...)

1 parent 381efbf commit 46876d2
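
In effect, a binary built with GGML_USE_CUBLAS=ON no longer requires a CUDA device at runtime: ggml_init_cublas() records whether cuBLAS came up, and the new ggml_cublas_loaded() query lets everything fall back to the CPU path. A minimal caller-side sketch of that behavior (not part of the commit; assumes linking against ggml built with CUBLAS enabled):

    #include <stdio.h>
    #include "ggml-cuda.h"

    int main(void) {
        // Safe to call unconditionally: since this commit it records
        // success/failure instead of aborting inside CUDA_CHECK.
        ggml_init_cublas();

        if (ggml_cublas_loaded()) {
            printf("CUDA devices found: ops may be offloaded to the GPU\n");
        } else {
            printf("no usable CUDA device: running entirely on the CPU\n");
        }
        return 0;
    }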

3 files changed (+126 −75 lines)


ggml-cuda.cu (+16 −1)

@@ -5790,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -5803,7 +5808,12 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaDeviceSynchronize());
 #endif
 
-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5851,6 +5861,7 @@ void ggml_init_cublas() {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
         initialized = true;
+        g_cublas_loaded = true;
     }
 }
 
@@ -7158,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -7843,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
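
The two early-return guards above are what produce the CPU fallback: when cuBLAS failed to load, ggml_cuda_can_mul_mat() and ggml_cuda_compute_forward() report that CUDA cannot handle the op, so the graph executor runs the regular CPU kernel instead. A simplified sketch of that dispatch shape (compute_forward_cpu() is a hypothetical stand-in for ggml's CPU kernels; the real dispatch lives in ggml.c):

    static void compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    #ifdef GGML_USE_CUBLAS
        // With g_cublas_loaded == false this returns false for every op,
        // so the CPU path below always runs.
        if (ggml_cuda_compute_forward(params, tensor)) {
            return; // op was executed on the GPU
        }
    #endif
        compute_forward_cpu(params, tensor); // hypothetical stand-in for the CPU kernels
    }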

ggml-cuda.h (+5 −0)

@@ -17,7 +17,12 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void ggml_cuda_host_free(void * ptr);
 
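The header comments spell out the contract: initialization never fails, and callers poll ggml_cublas_loaded() to learn whether GPU offload is actually available. A hypothetical consumer of that contract, e.g. a model loader deciding how many layers to offload (the remaining changed file of this commit is not shown here):

    static int effective_gpu_layers(int requested_gpu_layers) {
        ggml_init_cublas(); // "Always success."
        if (!ggml_cublas_loaded()) {
            return 0; // no usable CUDA device: keep every layer on the CPU
        }
        return requested_gpu_layers;
    }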
