Merge pull request #147 from junjihashimoto/feature/cuda

junjihashimoto · web-flow · commit cc60d93d3975 · 2023-09-29T20:54:34.000+09:00
Add inline-c-cuda package for CUDA
diff --git a/inline-c-cuda/LICENSE b/inline-c-cuda/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2015 FP Complete Corporation.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/inline-c-cuda/README.md b/inline-c-cuda/README.md
@@ -0,0 +1 @@
+Small set of utilities to inline CUDA code.  See tests for example.
diff --git a/inline-c-cuda/Setup.hs b/inline-c-cuda/Setup.hs
@@ -0,0 +1,2 @@
+import Distribution.Simple
+main = defaultMain
diff --git a/inline-c-cuda/inline-c-cuda.cabal b/inline-c-cuda/inline-c-cuda.cabal
@@ -0,0 +1,50 @@
+cabal-version:       2.2
+name:                inline-c-cuda
+version:             0.1.0.0
+synopsis:            Lets you embed CUDA code into Haskell.
+description:         Utilities to inline CUDA code into Haskell using inline-c.  See
+                     tests for example on how to build.
+license:             MIT
+license-file:        LICENSE
+author:              Francesco Mazzoli
+maintainer:          f@mazzo.li
+copyright:           (c) 2015-2016 FP Complete Corporation, (c) 2017-2019 Francesco Mazzoli
+category:            FFI
+tested-with:         GHC == 9.2.2
+build-type:          Simple
+
+source-repository head
+  type:     git
+  location: https://github.com/fpco/inline-c
+
+library
+  exposed-modules:     Language.C.Inline.Cuda
+  build-depends:       base >=4.7 && <5
+                     , bytestring
+                     , inline-c >= 0.9.0.0
+                     , inline-c-cpp
+                     , template-haskell
+                     , text
+                     , safe-exceptions
+                     , containers
+                     , process
+  hs-source-dirs:      src
+  default-language:    Haskell2010
+  ghc-options:         -Wall
+  extra-libraries:     cudart
+
+test-suite tests
+  type:                exitcode-stdio-1.0
+  hs-source-dirs:      test
+  main-is:             tests.hs
+  build-depends:       base >=4 && <5
+                     , bytestring
+                     , inline-c
+                     , inline-c-cpp
+                     , inline-c-cuda
+                     , safe-exceptions
+                     , hspec
+                     , containers
+                     , template-haskell
+                     , vector
+  default-language:    Haskell2010
diff --git a/inline-c-cuda/src/Language/C/Inline/Cuda.hs b/inline-c-cuda/src/Language/C/Inline/Cuda.hs
@@ -0,0 +1,53 @@
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+-- | Module exposing a 'Context' to inline CUDA code.  We only have used
+-- this for experiments, so use with caution.  See the CUDA tests to see
+-- how to build inline CUDA code.
+module Language.C.Inline.Cuda
+  ( module Language.C.Inline
+  , cudaCtx
+  , Cpp.cppTypePairs
+  , Cpp.using
+  , Cpp.AbstractCppExceptionPtr
+  ) where
+
+import qualified Language.Haskell.TH as TH
+import qualified Language.Haskell.TH.Syntax as TH
+
+import           Language.C.Inline
+import           Language.C.Inline.Context
+import qualified Language.C.Types as CT
+import qualified Language.C.Inline.Cpp as Cpp
+
+import qualified Data.Map as Map
+import           Control.Monad.IO.Class (liftIO)
+import           System.Exit (ExitCode(..))
+import           System.Process (readProcessWithExitCode)
+
+compileCuda :: String -> TH.Q FilePath
+compileCuda src = do
+  cuFile <- TH.addTempFile "cu"
+  oFile <- TH.addTempFile "o"
+  let (cmd,args) = ("nvcc", ["-c","-o",oFile, cuFile])
+  (code, stdout, stderr) <- liftIO $ do
+    writeFile cuFile src
+    readProcessWithExitCode cmd args ""
+  case code of
+    ExitFailure _ -> fail $ "Compile Command: " ++ (foldl (\a b -> a ++ " " ++ b) " " (cmd : args)) ++ "\n" ++ "    Output: " ++ stdout ++ "\n" ++ "    Error: " ++ stderr
+    ExitSuccess -> return oFile
+
+-- | The equivalent of 'C.baseCtx' for CUDA.  It specifies the @.cu@
+-- file extension for the CUDA file, so that nvcc will decide to build CUDA
+-- instead of C.  See the @.cabal@ test target for an example on how to
+-- build.
+cudaCtx :: Context
+cudaCtx = Cpp.cppCtx <> mempty
+  { ctxForeignSrcLang = Just TH.RawObject
+  , ctxOutput = Just $ \s -> "extern \"C\" {\n" ++ s ++ "\n}"
+  , ctxEnableCpp = True
+  , ctxRawObjectCompile = Just compileCuda
+  , ctxTypesTable = Map.singleton (CT.TypeName "std::exception_ptr") [t|Cpp.AbstractCppExceptionPtr|]
+  }
+
diff --git a/inline-c-cuda/test/tests.hs b/inline-c-cuda/test/tests.hs
@@ -0,0 +1,133 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE KindSignatures #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PolyKinds #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeInType #-}
+{-# LANGUAGE TypeOperators #-}
+{-# LANGUAGE UndecidableInstances #-}
+{-# LANGUAGE TypeApplications #-}
+{-# OPTIONS_GHC -Wno-deprecations #-}
+
+import           Control.Exception.Safe
+import           Control.Monad
+import qualified Language.C.Inline.Context as CC
+import qualified Language.C.Types as CT
+import qualified Language.C.Inline.Cuda as C
+import qualified Test.Hspec as Hspec
+import           Test.Hspec (shouldBe)
+import           Foreign.Ptr (Ptr)
+import           Data.Monoid
+import           Foreign.Marshal.Array
+import           Foreign.Marshal.Alloc
+import           Foreign.Storable
+
+
+C.context $ C.cudaCtx
+
+C.include "<iostream>"
+C.include "<stdexcept>"
+
+[C.emitBlock|
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+|]
+
+cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b
+cudaAllocaArray size func = do
+  let csize = fromIntegral size
+  alloca $ \(ptr_d_A :: Ptr (Ptr C.CFloat)) -> do
+    [C.block| void {
+      cudaError_t err = cudaMalloc((void **)$(float** ptr_d_A), $(int csize) * sizeof(float));
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+    } |]
+    d_A <- peekElemOff ptr_d_A 0
+    ret <- func d_A
+    [C.block| void {
+      cudaError_t err = cudaFree($(float* d_A));
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+    } |]
+    return ret
+
+cudaMemcpyHostToDevice :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
+cudaMemcpyHostToDevice num host device = do
+  let cnum = fromIntegral num
+  [C.block| void {
+      cudaError_t err = cudaMemcpy($(float* device), $(float* host), $(int cnum) * sizeof(float), cudaMemcpyHostToDevice);
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+  } |]
+
+cudaMemcpyDeviceToHost :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
+cudaMemcpyDeviceToHost num device host = do
+  let cnum = fromIntegral num
+  [C.block| void {
+      cudaError_t err = cudaMemcpy($(float* host), $(float* device), $(int cnum) * sizeof(float), cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+  } |]
+
+
+main :: IO ()
+main = Hspec.hspec $ do
+  Hspec.describe "Basic CUDA" $ do
+    Hspec.it "Add vectors on device" $ do
+      let numElements = 50000
+          cNumElements = fromIntegral numElements
+      allocaArray numElements $ \(h_A :: Ptr C.CFloat) -> do
+        allocaArray numElements $ \(h_B :: Ptr C.CFloat) -> do
+          allocaArray numElements $ \(h_C :: Ptr C.CFloat) -> do
+            cudaAllocaArray numElements $ \(d_A :: Ptr C.CFloat) -> do
+              cudaAllocaArray numElements $ \(d_B :: Ptr C.CFloat) -> do
+                cudaAllocaArray numElements $ \(d_C :: Ptr C.CFloat) -> do
+                  [C.block| void {
+                    for (int i = 0; i < $(int cNumElements); ++i)
+                    {
+                        $(float* h_A)[i] = rand()/(float)RAND_MAX;
+                        $(float* h_B)[i] = rand()/(float)RAND_MAX;
+                    }
+                  } |]
+                  cudaMemcpyHostToDevice numElements h_A d_A
+                  cudaMemcpyHostToDevice numElements h_B d_B
+                  [C.block| void {
+                    int threadsPerBlock = 256;
+                    int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
+                    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements));
+                  } |]
+                  cudaMemcpyDeviceToHost numElements d_C h_C
+                  lA <- peekArray numElements h_A
+                  lB <- peekArray numElements h_B
+                  lC <- peekArray numElements h_C
+                  all (< 1e-5) (map (\((a,b),c) -> abs(a + b - c)) (zip (zip lA lB) lC)) `shouldBe` True
diff --git a/inline-c/src/Language/C/Inline.hs b/inline-c/src/Language/C/Inline.hs
@@ -42,6 +42,7 @@ module Language.C.Inline
   , block
   , include
   , verbatim
+  , emitBlock
 
     -- * 'Ptr' utils
   , withPtr
diff --git a/inline-c/src/Language/C/Inline/Context.hs b/inline-c/src/Language/C/Inline/Context.hs
@@ -161,6 +161,8 @@ data Context = Context
   , ctxForeignSrcLang :: Maybe TH.ForeignSrcLang
     -- ^ TH.LangC by default
   , ctxEnableCpp :: Bool
+    -- ^ Compile source code to raw object.
+  , ctxRawObjectCompile :: Maybe (String -> TH.Q FilePath)
   }
 
 
@@ -172,6 +174,7 @@ instance Semigroup Context where
     , ctxOutput = ctxOutput ctx1 <|> ctxOutput ctx2
     , ctxForeignSrcLang = ctxForeignSrcLang ctx1 <|> ctxForeignSrcLang ctx2
     , ctxEnableCpp = ctxEnableCpp ctx1 || ctxEnableCpp ctx2
+    , ctxRawObjectCompile = ctxRawObjectCompile ctx1 <|> ctxRawObjectCompile ctx2
     }
 #endif
 
@@ -182,6 +185,7 @@ instance Monoid Context where
     , ctxOutput = Nothing
     , ctxForeignSrcLang = Nothing
     , ctxEnableCpp = False
+    , ctxRawObjectCompile = Nothing
     }
 
 #if !MIN_VERSION_base(4,11,0)
@@ -191,6 +195,7 @@ instance Monoid Context where
     , ctxOutput = ctxOutput ctx1 <|> ctxOutput ctx2
     , ctxForeignSrcLang = ctxForeignSrcLang ctx1 <|> ctxForeignSrcLang ctx2
     , ctxEnableCpp = ctxEnableCpp ctx1 || ctxEnableCpp ctx2
+    , ctxRawObjectCompile = ctxRawObjectCompile ctx1 <|> ctxRawObjectCompile ctx2
     }
 #endif
 
diff --git a/inline-c/src/Language/C/Inline/Internal.hs b/inline-c/src/Language/C/Inline/Internal.hs
@@ -32,6 +32,7 @@ module Language.C.Inline.Internal
 
       -- ** Emitting C code
     , emitVerbatim
+    , emitBlock
 
       -- ** Inlining C code
       -- $embedding
@@ -168,11 +169,16 @@ initialiseModuleState mbContext = do
           Nothing -> fail "inline-c: ModuleState not present (initialiseModuleState)"
           Just ms -> return ms
         let lang = fromMaybe TH.LangC (ctxForeignSrcLang context)
+            addForeignSource =
 #if MIN_VERSION_base(4,12,0)
-        TH.addForeignSource lang (concat (reverse (msFileChunks ms)))
+              TH.addForeignSource
 #else
-        TH.addForeignFile lang (concat (reverse (msFileChunks ms)))
+              TH.addForeignFile
 #endif
+            src = (concat (reverse (msFileChunks ms)))
+        case (lang, ctxRawObjectCompile context) of
+          (TH.RawObject, Just compile) -> compile src >>= TH.addForeignFilePath lang
+          (_, _)  -> addForeignSource lang src
       let moduleState = ModuleState
             { msContext = context
             , msGeneratedNames = 0
@@ -234,6 +240,15 @@ emitVerbatim s = do
     (ms{msFileChunks = chunk : msFileChunks ms}, ())
   return []
 
+-- | Simply appends some string of block to the module's C file.  Use with care.
+emitBlock :: TH.QuasiQuoter
+emitBlock = TH.QuasiQuoter
+  { TH.quoteExp = const $ fail "inline-c: quoteExp not implemented (quoteCode)"
+  , TH.quotePat = const $ fail "inline-c: quotePat not implemented (quoteCode)"
+  , TH.quoteType = const $ fail "inline-c: quoteType not implemented (quoteCode)"
+  , TH.quoteDec = emitVerbatim
+  }
+
 ------------------------------------------------------------------------
 -- Inlining
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Small set of utilities to inline CUDA code. See tests for example.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+import Distribution.Simple`
	`2`	`+main = defaultMain`