@@ -37,6 +37,57 @@ C.context $ C.cudaCtx
3737
3838C. include " <iostream>"
3939C. include " <stdexcept>"
40+ C. include " <cstring>"
41+
42+ #ifdef TEST_WITHOUT_CUDA
43+
44+ [C. emitBlock |
45+
/*
 * Host-side stand-in for the vectorAdd kernel, compiled when
 * TEST_WITHOUT_CUDA is defined.
 *
 * Walks every (block, thread) pair of the requested launch shape
 * sequentially and computes C[i] = A[i] + B[i] for each flat index i that
 * is smaller than numElements.
 *
 *   blocksPerGrid    number of emulated blocks
 *   threadsPerBlock  number of emulated threads in each block
 *   A, B             input arrays, read at indices below numElements
 *   C                output array, written at indices below numElements
 *   numElements      upper bound (exclusive) on the indices touched
 */
void
vectorAdd(int blocksPerGrid, int threadsPerBlock, const float *A, const float *B, float *C, int numElements)
{
    for (int block = 0; block < blocksPerGrid; ++block) {
        /* Flat index of the first element handled by this block. */
        const int base = threadsPerBlock * block;

        for (int lane = 0; lane < threadsPerBlock; ++lane) {
            const int idx = base + lane;

            /* The launch shape may cover more slots than there are elements. */
            if (idx >= numElements) {
                continue;
            }

            C[idx] = A[idx] + B[idx];
        }
    }
}
61+
62+
/*
 * Minimal host-only replacements for the CUDA runtime entry points used by
 * this test when TEST_WITHOUT_CUDA is defined. "Device" memory is ordinary
 * heap memory, so copies in either direction are plain memcpy calls.
 */

/* Status code type; the numeric values below follow the real runtime. */
typedef int cudaError_t;
const int cudaSuccess = 0;
const int cudaErrorMemoryAllocation = 2;

/*
 * Allocates `size` bytes on the host heap and stores the pointer in *dst.
 * Returns cudaErrorMemoryAllocation when a non-empty request cannot be
 * satisfied, cudaSuccess otherwise.
 * NOTE(review): malloc/free come from <cstdlib>; confirm the file includes it
 * rather than relying on a transitive include.
 */
cudaError_t cudaMalloc(void** dst, size_t size){
    *dst = malloc(size);
    /* malloc(0) may legitimately yield NULL, so only a sized request fails. */
    if (*dst == NULL && size != 0) {
        return cudaErrorMemoryAllocation;
    }
    return cudaSuccess;
}

/* Releases memory obtained from the cudaMalloc stub; NULL is accepted. */
cudaError_t cudaFree(void* dst){
    free(dst);
    return cudaSuccess;
}

/* Copy directions; accepted for source compatibility, ignored by the stub. */
const int cudaMemcpyHostToDevice = 0;
const int cudaMemcpyDeviceToHost = 1;

/*
 * Copies nbytes from src to dst. The source is const-qualified, as in the
 * real API, so read-only host buffers can be passed without a cast.
 */
cudaError_t cudaMemcpy(void *dst, const void *src, size_t nbytes, int direction){
    (void)direction;  /* both sides live in host memory */
    if (nbytes != 0) {
        memcpy(dst, src, nbytes);
    }
    return cudaSuccess;
}

/*
 * Maps a status code to a human-readable message. Returns a pointer to a
 * string literal, hence the const return type.
 */
const char* cudaGetErrorString(cudaError_t err){
    switch (err) {
        case cudaSuccess:
            return "no error";
        case cudaErrorMemoryAllocation:
            return "out of memory";
        default:
            return "unrecognized error code";
    }
}
87+
88+ |]
89+
90+ #else
4091
4192[C. emitBlock |
4293__global__ void
@@ -51,6 +102,8 @@ vectorAdd(const float *A, const float *B, float *C, int numElements)
51102}
52103|]
53104
105+ #endif
106+
54107cudaAllocaArray :: forall b . Int -> (Ptr C. CFloat -> IO b ) -> IO b
55108cudaAllocaArray size func = do
56109 let csize = fromIntegral size
@@ -121,11 +174,19 @@ main = Hspec.hspec $ do
121174 } |]
122175 cudaMemcpyHostToDevice numElements h_A d_A
123176 cudaMemcpyHostToDevice numElements h_B d_B
177+ #ifdef TEST_WITHOUT_CUDA
178+ [C. block | void {
179+ const int threadsPerBlock = 256;
180+ const int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
181+ vectorAdd(blocksPerGrid, threadsPerBlock, $(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements));
182+ } |]
183+ #else
124184 [C. block | void {
125- int threadsPerBlock = 256;
126- int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
185+ const int threadsPerBlock = 256;
186+ const int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
127187 vectorAdd<<<blocksPerGrid, threadsPerBlock>>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements));
128188 } |]
189+ #endif
129190 cudaMemcpyDeviceToHost numElements d_C h_C
130191 lA <- peekArray numElements h_A
131192 lB <- peekArray numElements h_B
0 commit comments