|
| 1 | +{-# LANGUAGE CPP #-} |
| 2 | +{-# LANGUAGE ConstraintKinds #-} |
| 3 | +{-# LANGUAGE DataKinds #-} |
| 4 | +{-# LANGUAGE FlexibleContexts #-} |
| 5 | +{-# LANGUAGE FlexibleInstances #-} |
| 6 | +{-# LANGUAGE GADTs #-} |
| 7 | +{-# LANGUAGE KindSignatures #-} |
| 8 | +{-# LANGUAGE MultiParamTypeClasses #-} |
| 9 | +{-# LANGUAGE OverloadedStrings #-} |
| 10 | +{-# LANGUAGE PolyKinds #-} |
| 11 | +{-# LANGUAGE QuasiQuotes #-} |
| 12 | +{-# LANGUAGE RankNTypes #-} |
| 13 | +{-# LANGUAGE ScopedTypeVariables #-} |
| 14 | +{-# LANGUAGE TemplateHaskell #-} |
| 15 | +{-# LANGUAGE TypeFamilies #-} |
| 16 | +{-# LANGUAGE TypeInType #-} |
| 17 | +{-# LANGUAGE TypeOperators #-} |
| 18 | +{-# LANGUAGE UndecidableInstances #-} |
| 19 | +{-# LANGUAGE TypeApplications #-} |
| 20 | +{-# OPTIONS_GHC -Wno-deprecations #-} |
| 21 | + |
| 22 | +import Control.Exception.Safe |
| 23 | +import Control.Monad |
| 24 | +import qualified Language.C.Inline.Context as CC |
| 25 | +import qualified Language.C.Types as CT |
| 26 | +import qualified Language.C.Inline.Cuda as C |
| 27 | +import qualified Test.Hspec as Hspec |
| 28 | +import Test.Hspec (shouldBe) |
| 29 | +import Foreign.Ptr (Ptr) |
| 30 | +import Data.Monoid |
| 31 | +import Foreign.Marshal.Array |
| 32 | +import Foreign.Marshal.Alloc |
| 33 | +import Foreign.Storable |
| 34 | + |
| 35 | + |
| 36 | +C.context $ C.cudaCtx |
| 37 | + |
| 38 | +C.include "<iostream>" |
| 39 | +C.include "<stdexcept>" |
| 40 | + |
| 41 | +[C.emitBlock| |
| 42 | +__global__ void |
| 43 | +vectorAdd(const float *A, const float *B, float *C, int numElements) |
| 44 | +{ |
| 45 | + int i = blockDim.x * blockIdx.x + threadIdx.x; |
| 46 | + |
| 47 | + if (i < numElements) |
| 48 | + { |
| 49 | + C[i] = A[i] + B[i]; |
| 50 | + } |
| 51 | +} |
| 52 | +|] |
| 53 | + |
| 54 | +cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b |
| 55 | +cudaAllocaArray size func = do |
| 56 | + let csize = fromIntegral size |
| 57 | + alloca $ \(ptr_d_A :: Ptr (Ptr C.CFloat)) -> do |
| 58 | + [C.block| void { |
| 59 | + cudaError_t err = cudaMalloc((void **)$(float** ptr_d_A), $(int csize) * sizeof(float)); |
| 60 | + if (err != cudaSuccess) |
| 61 | + { |
| 62 | + fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err)); |
| 63 | + exit(EXIT_FAILURE); |
| 64 | + } |
| 65 | + } |] |
| 66 | + d_A <- peekElemOff ptr_d_A 0 |
| 67 | + ret <- func d_A |
| 68 | + [C.block| void { |
| 69 | + cudaError_t err = cudaFree($(float* d_A)); |
| 70 | + if (err != cudaSuccess) |
| 71 | + { |
| 72 | + fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); |
| 73 | + exit(EXIT_FAILURE); |
| 74 | + } |
| 75 | + } |] |
| 76 | + return ret |
| 77 | + |
| 78 | +cudaMemcpyHostToDevice :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO () |
| 79 | +cudaMemcpyHostToDevice num host device = do |
| 80 | + let cnum = fromIntegral num |
| 81 | + [C.block| void { |
| 82 | + cudaError_t err = cudaMemcpy($(float* device), $(float* host), $(int cnum) * sizeof(float), cudaMemcpyHostToDevice); |
| 83 | + if (err != cudaSuccess) |
| 84 | + { |
| 85 | + fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err)); |
| 86 | + exit(EXIT_FAILURE); |
| 87 | + } |
| 88 | + } |] |
| 89 | + |
| 90 | +cudaMemcpyDeviceToHost :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO () |
| 91 | +cudaMemcpyDeviceToHost num device host = do |
| 92 | + let cnum = fromIntegral num |
| 93 | + [C.block| void { |
| 94 | + cudaError_t err = cudaMemcpy($(float* host), $(float* device), $(int cnum) * sizeof(float), cudaMemcpyDeviceToHost); |
| 95 | + if (err != cudaSuccess) |
| 96 | + { |
| 97 | + fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err)); |
| 98 | + exit(EXIT_FAILURE); |
| 99 | + } |
| 100 | + } |] |
| 101 | + |
| 102 | + |
| 103 | +main :: IO () |
| 104 | +main = Hspec.hspec $ do |
| 105 | + Hspec.describe "Basic CUDA" $ do |
| 106 | + Hspec.it "Add vectors on device" $ do |
| 107 | + let numElements = 50000 |
| 108 | + cNumElements = fromIntegral numElements |
| 109 | + allocaArray numElements $ \(h_A :: Ptr C.CFloat) -> do |
| 110 | + allocaArray numElements $ \(h_B :: Ptr C.CFloat) -> do |
| 111 | + allocaArray numElements $ \(h_C :: Ptr C.CFloat) -> do |
| 112 | + cudaAllocaArray numElements $ \(d_A :: Ptr C.CFloat) -> do |
| 113 | + cudaAllocaArray numElements $ \(d_B :: Ptr C.CFloat) -> do |
| 114 | + cudaAllocaArray numElements $ \(d_C :: Ptr C.CFloat) -> do |
| 115 | + [C.block| void { |
| 116 | + for (int i = 0; i < $(int cNumElements); ++i) |
| 117 | + { |
| 118 | + $(float* h_A)[i] = rand()/(float)RAND_MAX; |
| 119 | + $(float* h_B)[i] = rand()/(float)RAND_MAX; |
| 120 | + } |
| 121 | + } |] |
| 122 | + cudaMemcpyHostToDevice numElements h_A d_A |
| 123 | + cudaMemcpyHostToDevice numElements h_B d_B |
| 124 | + [C.block| void { |
| 125 | + int threadsPerBlock = 256; |
| 126 | + int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock; |
| 127 | + vectorAdd<<<blocksPerGrid, threadsPerBlock>>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements)); |
| 128 | + } |] |
| 129 | + cudaMemcpyDeviceToHost numElements d_C h_C |
| 130 | + lA <- peekArray numElements h_A |
| 131 | + lB <- peekArray numElements h_B |
| 132 | + lC <- peekArray numElements h_C |
| 133 | + all (< 1e-5) (map (\((a,b),c) -> abs(a + b - c)) (zip (zip lA lB) lC)) `shouldBe` True |
0 commit comments