@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         } else
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
-            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            if (src0->type == GGML_TYPE_F32) {
+                ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            } else {
+                CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            }
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
 void * ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        return nullptr;
+        // Prioritize CUDA graph compatibility over direct memory copy optimization.
+        // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
+        if (src0->type == GGML_TYPE_F32) {
+            return (void *) cpy_flt<cpy_1_flt<float, float>>;
+        } else {
+            return nullptr;
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         return (void *) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
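For context, the retargeting trick that makes these copy kernels CUDA-graph friendly can be illustrated with a small standalone sketch. The kernel below resolves its destination pointer from a device-side pointer table at run time rather than taking it as a literal argument, so a launch captured into a CUDA graph can be pointed at a new buffer by rewriting one table entry instead of re-capturing the graph. This is a minimal illustration only, not the ggml implementation; the names copy_f32_indirect, dest_ptrs and node_idx are invented for the example.

// Minimal standalone sketch (hypothetical names, not ggml code): a contiguous
// float -> float copy whose destination is read from a device-side pointer
// table, so a graph-captured launch can be retargeted without re-capture.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void copy_f32_indirect(const float * src, float ** dest_ptrs, int node_idx, int64_t n) {
    float * dst = dest_ptrs[node_idx];                 // resolve destination at run time
    const int64_t i = blockIdx.x * (int64_t) blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
}

int main() {
    const int64_t n = 1 << 20;
    const int num_blocks = (int) ((n + 255) / 256);
    float * src; float * dst_a; float * dst_b; float ** dest_ptrs;
    cudaMalloc(&src,       n * sizeof(float));
    cudaMalloc(&dst_a,     n * sizeof(float));
    cudaMalloc(&dst_b,     n * sizeof(float));
    cudaMalloc(&dest_ptrs, sizeof(float *));           // one-slot indirection table
    cudaMemset(src, 0, n * sizeof(float));

    // Point slot 0 at dst_a and launch.
    cudaMemcpy(dest_ptrs, &dst_a, sizeof(float *), cudaMemcpyHostToDevice);
    copy_f32_indirect<<<num_blocks, 256>>>(src, dest_ptrs, 0, n);

    // Retarget the same launch parameters to dst_b by updating only the table,
    // not the kernel arguments - this is what a captured graph node relies on.
    cudaMemcpy(dest_ptrs, &dst_b, sizeof(float *), cudaMemcpyHostToDevice);
    copy_f32_indirect<<<num_blocks, 256>>>(src, dest_ptrs, 0, n);

    cudaDeviceSynchronize();
    printf("copies done\n");
    cudaFree(src); cudaFree(dst_a); cudaFree(dst_b); cudaFree(dest_ptrs);
    return 0;
}

A plain cudaMemcpyAsync, by contrast, fixes the destination address inside the captured node, which is the motivation stated in the added comment: the hunks above route contiguous F32 copies through ggml_cpy_flt_cuda and return the cpy_flt kernel from ggml_cuda_cpy_fn instead of nullptr so that graph indirection keeps working.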