MATLAB call cuFINUFFT #432
No, sorry, there is no matlab interface to cufinufft ... but it is on the
list of things to do. There may be a workaround using matlab's ability to
call python functions (py.*), but I have not seen a way to do that with
pycuda.
A proper matlab cufinufft interface would pass gpuArrays instead of copying
H2D and D2H for each transform (is that what you're looking for?). I am
not sure we can still use mwrap (our MEX interface generator). It looks
like we would need CUDAKernel instead:
https://www.mathworks.com/help/parallel-computing/run-cuda-or-ptx-code-on-gpu.html
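For reference, a minimal, untested sketch of the CUDAKernel route that page describes (the kernel and file names below are hypothetical):
% compile the kernel to PTX outside MATLAB first, e.g.:  nvcc -ptx myKernel.cu
N = 1e5;
k = parallel.gpu.CUDAKernel('myKernel.ptx', 'myKernel.cu');  % load the compiled kernel
k.ThreadBlockSize = 256;
k.GridSize = ceil(N/256);
d_in  = gpuArray(rand(N,1));        % data already resident on the device
d_out = gpuArray.zeros(N,1);        % preallocate device output
d_out = feval(k, d_in, d_out, N);   % launch; the result stays on the GPU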
In case we need to implement a matlab cufinufft interface soon, I'm posting this here as a future reference. MATLAB code:
%%% compile mex cuda
% matlab version
setenv('CUDAPATH', '/usr/local/cuda-12.3'); % replace with your actual CUDA path
% mexcuda('-v', 'mexGPUlapslp.cu', ...
mexcuda('mexGPUlapslp.cu', ...
'-largeArrayDims', ...
['NVCCFLAGS="-gencode=arch=compute_80,code=sm_80"'], ...
['-I' getenv('MATLABROOT') '/extern/include'], ...
['-I' getenv('MATLABROOT') '/simulink/include'], ...
['-I' getenv('MATLABROOT') '/toolbox/parallel/gpu/extern/include/'], ...
['-L' getenv('MATLABROOT') '/bin/glnxa64'], ...
['-L' getenv('CUDAPATH') '/targets/x86_64-linux/lib'], ...
['-I' getenv('CUDAPATH') '/include'], ...
'-lcublas', ...
'-lcusparse', ... % add any other required CUDA libraries
'-lmwgpu', ... % add any other required MATLAB GPU libraries
'-lmx', '-lmex', '-lmat', '-lm'); % MATLAB libraries
%
N = 20000;
M = 30000;
srcx = rand(1,N); srcy = rand(1,N); srcz = rand(1,N);
targx = rand(1,M); targy = rand(1,M); targz = rand(1,M);
x = rand(1,N);
% run once; the MEX checks nlhs, so request both outputs (potential and kernel time in ms)
[y, cukerneltime] = mexGPUlapslp([srcx;srcy;srcz],[targx;targy;targz],x);
% then time
tic,
[y, cukerneltime] = mexGPUlapslp([srcx;srcy;srcz],[targx;targy;targz],x);
% A = reshape(A(:),N,M)';
toc
%
tic,
A2 = 1./sqrt((srcx - targx(:)).^2+(srcy - targy(:)).^2+(srcz - targz(:)).^2);
y2 = A2*x(:);
toc
%
diff = abs(y-y2)/max(abs(y)); max(diff)
% diffA = abs(A-A2); max(diffA(:))
The following CUDA mexFunction file 'mexGPUlapslp.cu' takes MATLAB CPU arrays as input, transfers them to GPU arrays (H2D), then launches the CUDA kernel:
/*
* cuda lap slp pot & matlab mex interface
* 11/15/23 Hai, do not form matrix explicitly
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
// #include "cublas_v2.h"
/* these are for matlab */
#include "mex.h"
#include "gpu/mxGPUArray.h"
#define THRESH 1e-15
/*
* Device code
*/
void __global__ culapslppot(double const * const src,
double const * const targ,
double const * const x,
double * const y,
int const N,
int const M)
{
/* Calculate the global linear index, assuming a 1-d grid. */
int const i = blockDim.x * blockIdx.x + threadIdx.x;
double dx, dy, dz, dd, threshsq;
threshsq = THRESH*THRESH;
if (i<M) {
y[i] = 0.0;
for (int j = 0; j < N; ++j) {
dx = src[3*j] - targ[3*i];
dy = src[3*j+1] - targ[3*i+1];
dz = src[3*j+2] - targ[3*i+2];
dd = dx*dx + dy*dy + dz*dz;
if (dd>threshsq){
y[i] += x[j]*rsqrt(dd);
}
}
}
}
/*
* Host code (just slp potential)
*/
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
/* Declare all variables.*/
mxDouble *src, *targ, *x; /* input: source, target, density */
mxDouble *y; /* output: potential */
double *d_src, *d_targ;
double *d_x, *d_y;
float *curuntime;
//double *d_A;
int N, M; /* dimension */
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
char const * const errId = "parallel:gpu:culapslppot:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
src = mxGetDoubles(prhs[0]); /* source */
targ = mxGetDoubles(prhs[1]); /* target */
x = mxGetDoubles(prhs[2]); /* density */
M = (int)(mxGetN(prhs[1]));
N = (int)(mxGetN(prhs[0]));
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 64;
int blocksPerGrid;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
if ((nrhs != 3) || (nlhs != 2)) {
mexErrMsgIdAndTxt(errId, errMsg);
}
/* potential, no explicit lap slp mat */
blocksPerGrid = (M + threadsPerBlock - 1) / threadsPerBlock;
cudaMalloc((void**)&d_src, 3*N*sizeof(double));
cudaMemcpy(d_src, src, 3*N*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_targ, 3*M*sizeof(double));
cudaMemcpy(d_targ, targ, 3*M*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_x, N*sizeof(double));
cudaMemcpy(d_x, x, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_y, M*sizeof(double));
cudaEventRecord(start);
culapslppot<<<blocksPerGrid, threadsPerBlock>>>(d_src, d_targ, d_x, d_y, N, M);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
/* Copy result back to host */
plhs[0] = mxCreateDoubleMatrix((mwSize)M, (mwSize)1, mxREAL);
y = mxGetDoubles(plhs[0]); /* 1st output */
cudaMemcpy(y, d_y, M*sizeof(double), cudaMemcpyDeviceToHost);
plhs[1] = mxCreateNumericMatrix(1,1,mxSINGLE_CLASS,mxREAL);
curuntime = (float *) mxGetData(plhs[1]);
curuntime[0] = milliseconds;
/* Free GPU memory */
cudaFree(d_src);
cudaFree(d_targ);
cudaFree(d_x);
cudaFree(d_y);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
The following file 'mexFunction.cu' is a MATLAB mexFunction example (from MathWorks) that takes a native gpuArray as input (no H2D transfer needed), then launches the CUDA kernel:
/*
* Example of how to use the mxGPUArray API in a MEX file. This example shows
* how to write a MEX function that takes a gpuArray input and returns a
* gpuArray output, e.g. B=mexFunction(A).
*
* Copyright 2012 The MathWorks, Inc.
*/
#include "mex.h"
#include "gpu/mxGPUArray.h"
/*
* Device code
*/
void __global__ TimesTwo(double const * const A,
double * const B,
int const N)
{
/* Calculate the global linear index, assuming a 1-d grid. */
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
B[i] = 2.0 * A[i];
}
}
/*
* Host code
*/
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray *B;
double const *d_A;
double *d_B;
int N;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
/* Throw an error if the input is not a GPU array. */
if ((nrhs!=1) || !(mxIsGPUArray(prhs[0]))) {
mexErrMsgIdAndTxt(errId, errMsg);
}
A = mxGPUCreateFromMxArray(prhs[0]);
/*
* Verify that A really is a double array before extracting the pointer.
*/
if (mxGPUGetClassID(A) != mxDOUBLE_CLASS) {
mexErrMsgIdAndTxt(errId, errMsg);
}
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
d_A = (double const *)(mxGPUGetDataReadOnly(A));
/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_B = (double *)(mxGPUGetData(B));
/*
* Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
* and it would be possible for the number of elements to be too large for
* the grid. For this example we are not guarding against this possibility.
*/
N = (int)(mxGPUGetNumberOfElements(A));
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
TimesTwo<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, N);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the MEX function.
*/
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
}
Thanks Libin for these ideas - working directly with gpuArrays seems best. That needs some CUDA experience. One would then wrap each of the four guru interface functions by hand. Also linking to the old cufinufft repo request for this: flatironinstitute/cufinufft#59
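For illustration only, here is an untested sketch (hypothetical file and function names, error checking omitted) of what a gpuArray-in / gpuArray-out wrapper around the 2D type-1 calls could look like, combining the mxGPUArray pattern above with the cufinufftf guru calls:
/* cufinufft2d1_gpuarray.cu (hypothetical): gpuArray in, gpuArray out, no cudaMemcpy.
 * Usage sketch:  fk = cufinufft2d1_gpuarray(x, y, c, isign, eps, N1, N2)
 * where x, y are real single gpuArrays of length M and c is a complex single gpuArray. */
#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <cuComplex.h>
#include <cufinufft.h>
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    mxInitGPU();
    /* wrap the incoming gpuArrays; the data stays on the device */
    mxGPUArray const *x = mxGPUCreateFromMxArray(prhs[0]);
    mxGPUArray const *y = mxGPUCreateFromMxArray(prhs[1]);
    mxGPUArray const *c = mxGPUCreateFromMxArray(prhs[2]);
    int     isign = (int)mxGetScalar(prhs[3]);
    float   eps   = (float)mxGetScalar(prhs[4]);
    int64_t N1    = (int64_t)mxGetScalar(prhs[5]);
    int64_t N2    = (int64_t)mxGetScalar(prhs[6]);
    int     M     = (int)mxGPUGetNumberOfElements(x);
    /* raw device pointers; the cufinufft API takes non-const pointers, hence the casts */
    float *d_x = (float *)mxGPUGetDataReadOnly(x);
    float *d_y = (float *)mxGPUGetDataReadOnly(y);
    cuFloatComplex *d_c = (cuFloatComplex *)mxGPUGetDataReadOnly(c);
    /* output gpuArray holding the N1*N2 Fourier modes */
    mwSize dims[2] = {(mwSize)N1, (mwSize)N2};
    mxGPUArray *fk = mxGPUCreateGPUArray(2, dims, mxSINGLE_CLASS, mxCOMPLEX,
                                         MX_GPU_DO_NOT_INITIALIZE);
    cuFloatComplex *d_fk = (cuFloatComplex *)mxGPUGetData(fk);
    /* guru interface: plan, set points, execute, destroy (return codes ignored here) */
    int64_t modes[2] = {N1, N2};
    cufinufftf_plan plan;
    cufinufftf_makeplan(1, 2, modes, isign, 1, eps, &plan, NULL);
    cufinufftf_setpts(plan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
    cufinufftf_execute(plan, d_c, d_fk);
    cufinufftf_destroy(plan);
    /* hand the result back to MATLAB as a gpuArray: no D2H copy */
    plhs[0] = mxGPUCreateMxArrayOnGPU(fk);
    mxGPUDestroyGPUArray(x);
    mxGPUDestroyGPUArray(y);
    mxGPUDestroyGPUArray(c);
    mxGPUDestroyGPUArray(fk);
}
On the MATLAB side one would then call something like fk = cufinufft2d1_gpuarray(gpuArray(single(x)), gpuArray(single(y)), gpuArray(complex(single(c))), +1, 1e-6, N1, N2), and the result remains on the GPU for further gpuArray computation.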
Dear Alex Barnett and Libin Lu, thanks very much for your help!
Hello people, I have a working mexcuda implementation to call cufinufft2d1 directly from MATLAB. It assumes all the input variables and data arrays are in single precision and stored on the host (CPU). But if people want the input data in the form of gpuArrays, that is also easy to implement using mxGPUGetData and the other mxGPU interfaces. If anyone has an urgent need for other cufinufft interfaces, they can use this as a template to implement their own before official cufinufft support for MATLAB becomes available. mexcuda code:
/*
* MATLAB to CUDA interface to call cufinufft 2D transform type 1 (NU->U) with float precision.
 * Vectorized nufft calls are possible. This function assumes all major data variables are in single (float)
 * precision and on the CPU at input.
*
* Author: Junzhou Chen (junzhou[AT]chen[DOT]engineer)
*
*/
#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <cufinufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <complex.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <time.h>
#include <string.h>
/**
* Host code
*run in matlab im = cufinufft2d1(x,y,k,isign,eps,Nx,Ny, M, Ntrans)
*/
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
clock_t start, end;
double cpu_time_used;
/* Declare all variables.*/
float *x; // frequency domain dimension 1 coordinates
float *y; // frequency domain dimension 2 coordinates
float _Complex *k; // Pointer to complex frequency domain strengths, on host
mxComplexSingle * im ;// Pointer to complex image-domain data, on host
int isSign; //Whether to use + or - sign for complex exponential
float eps; //NUFFT precision
int64_t Nx; //Size of first dimension of image
int64_t Ny; //Size of second dimension of image
size_t M; // number of NU points
size_t Ntrans; // Number of stacked transforms
//int GPUDeviceIndex; //Which GPU to use
char const *const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const *const errMsg = "Invalid input to MEX file.";
char const *const instruction = "Usage:\n im = cufinufft2d1(x,y,k,isign,eps,Nx,Ny, M, Ntrans).";
char errorMessage[1024]; // remember to ensure this buffer is large enough for your message
// Check input number
if (nrhs != 9)
{
snprintf(errorMessage, sizeof(errorMessage), "Exactly 9 inputs required. \n%s", instruction);
mexErrMsgIdAndTxt("cufinufftf2d1:InputError", errorMessage);
}
// Check input type
if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1]) || !mxIsSingle(prhs[2]))
{
snprintf(errorMessage, sizeof(errorMessage), "Inputs 1-3 must be single. \n%s", instruction);
mexErrMsgIdAndTxt("cufinufftf2d1:InputError", errorMessage);
}
// Assign Pointers and value
x = mxGetSingles(prhs[0]);
y = mxGetSingles(prhs[1]);
k = (float _Complex *)mxGetComplexSingles(prhs[2]);
isSign = (int)mxGetScalar(prhs[3]);
eps = (float)mxGetScalar(prhs[4]);
Nx = (int64_t)mxGetScalar(prhs[5]);
Ny = (int64_t)mxGetScalar(prhs[6]);
M = (size_t)mxGetScalar(prhs[7]);
Ntrans = (size_t)mxGetScalar(prhs[8]);
//im = (float _Complex *) malloc((size_t)Nx * (size_t)Ny * Ntrans * sizeof(float _Complex));
im = (mxComplexSingle*)mxMalloc(Ny * Nx * Ntrans * sizeof(float _Complex));// output image
//GPUDeviceIndex = (int)mxGetScalar(prhs[9]) - 1; // k is zero based
//mexPrintf("Data Assignment Complete\n");
// Initialize the MathWorks GPU API.
mxInitGPU();
//cudaSetDevice(GPUDeviceIndex); //Set which GPU to run. This should strictly match the one being used by MATLAB
// Create modes
int64_t modes[2] = {Nx, Ny};
// Create default cufinufft plan object
cufinufftf_plan plan;
// Create on device variables
float *d_x, *d_y;
cuFloatComplex *d_k, *d_im;
// Allocate device memories
cudaMalloc(&d_x, M * sizeof(float));
cudaMalloc(&d_y, M * sizeof(float));
cudaMalloc(&d_k, M * Ntrans * sizeof(float _Complex));
cudaMalloc(&d_im, (size_t)Nx * (size_t)Ny * Ntrans * sizeof(float _Complex));
//mexPrintf("cuda Malloc Complete\n");
// Copy values to device
start = clock();
cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_k, k, M * Ntrans * sizeof(float _Complex), cudaMemcpyHostToDevice);
end = clock();
double timeHostToDevice = ((double)(end-start))/ CLOCKS_PER_SEC;
mexPrintf("Host to Device Copying Time (s): %.5f\n", timeHostToDevice);
//mexPrintf("CUDA MEM COPY Complete\n");
// make cufinufft plan
/*
* int cufinufftf_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, float tol, cufinufftf_plan *plan, cufinufft_opts *opts)
*/
cufinufftf_makeplan(1, 2, modes, isSign, Ntrans, eps, &plan, NULL);
//mexPrintf("cufinufftf_makeplan Complete\n");
// Set points
/*
 * int cufinufftf_setpts(cufinufftf_plan plan, int M, float* x, float* y, float* z, int N, float* s, float* t, float* u)
*/
cufinufftf_setpts(plan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
//mexPrintf("cufinufftf_setpts Complete\n");
/************ EXECUTE ************/
start = clock();
cufinufftf_execute(plan, d_k, d_im);
end = clock();
double timeExecute = ((double)(end-start))/ CLOCKS_PER_SEC;
mexPrintf("cufinufftf execution Time (s): %.5f\n", timeExecute);
//mexPrintf("cufinufftf_execute Complete\n");
/************ EXECUTE END ************/
// // transfer the data back onto the host, destroy the plan, and free the device arrays.
start = clock();
cudaMemcpy(im, d_im, (size_t)Nx * (size_t)Ny * Ntrans * sizeof(float _Complex), cudaMemcpyDeviceToHost);
end = clock();
double timeDeviceToHost = ((double)(end-start))/ CLOCKS_PER_SEC;
mexPrintf("Device to Host Copying Time (s): %.5f\n", timeDeviceToHost);
cufinufftf_destroy(plan);
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_k);
cudaFree(d_im);
//mexPrintf("cufinufftf_cleancup Complete\n");
//Create output
mwSize out_dimensions[3] = {(size_t)Nx, (size_t)Ny, Ntrans};
const mwSize * ndims = out_dimensions;
plhs[0] = mxCreateNumericArray(3, ndims, mxSINGLE_CLASS, mxCOMPLEX);
//mexPrintf("Create output Complete\n");
//Set Array Value
mxSetComplexSingles(plhs[0], im);
return;
}
The problem is that I only managed to compile this from the MATLAB command window:
mexcuda -R2018a -I[path/to/finufft/include] -L[path/to/finufft/build] -lcufinufft cufinufftf2d1.cu
% -R2018a flag because the interleaved complex number API is used
% https://www.mathworks.com/help/matlab/ref/mex.html
If you are using Linux, for example, before you can run the compiled binary in MATLAB you will need to tell MATLAB where to find the cufinufft dynamic library before launching it:
export LD_LIBRARY_PATH=path/to/finufft/build:$LD_LIBRARY_PATH
For other operating systems, read this manual. It would be more ideal to compile with CMake; there seems to be a general way to compile mexcuda targets with CMake, but I didn't manage to make it work for this code.
Nice demo - thanks for posting - I hope others find it useful. I haven't had time to test it - maybe someone will? Acting directly on gpuArrays would probably be the direction we would go if we were to bring this into our repo, to avoid the cudaMemcpy calls. Cheers, Alex
Can we use MATLAB to call functions which are associated with cuFINUFFT? Thanks!