
Commit 116437c

style(cuda): add more comments
Parent: b41da2c

2 files changed: +90 -45 lines

@@ -1,58 +1,82 @@
-#include "cuda_utils.h"
-#include "matrix_add.h"
+#include "cuda_utils.h" // Custom CUDA utilities for error checking, etc.
+#include "matrix_add.h" // Header file for this matrix addition module
 
+// Namespace to encapsulate CUDA kernel functions
 namespace cuda_kernel {
 
+// CUDA Kernel: Adds two matrices element-wise on the GPU
+// Each thread computes a single element of the result matrix
+// Parameters:
+// - matrixA: Device pointer to the input matrix A
+// - matrixB: Device pointer to the input matrix B
+// - resultMatrix: Device pointer to the output result matrix
+// - numRows: Number of rows in the matrices
+// - numCols: Number of columns in the matrices
 template <typename T>
 __global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols) {
+    // Calculate the row and column indices for this thread
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
 
+    // Ensure the thread is within valid matrix bounds
     if (row < numRows && col < numCols) {
         int index = row * numCols + col;
-        resultMatrix[index] = matrixA[index] + matrixB[index];
+        resultMatrix[index] = matrixA[index] + matrixB[index]; // Perform element-wise addition
     }
 }
 
 } // namespace cuda_kernel
 
+// C++ Function: Handles matrix addition on the GPU
+// Transfers matrices from the host (CPU) to the device (GPU), performs the computation,
+// and then copies the result back to the host.
+// Parameters:
+// - hostMatrixA: Pointer to matrix A on the host (CPU)
+// - hostMatrixB: Pointer to matrix B on the host (CPU)
+// - hostResultMatrix: Pointer to the result matrix on the host (CPU)
+// - numRows: Number of rows in the matrices
+// - numCols: Number of columns in the matrices
 template <typename T>
 void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols) {
+    // Calculate the size of the matrices in bytes
     size_t matrixSizeBytes = numRows * numCols * sizeof(T);
 
+    // Device (GPU) memory pointers
     T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
-    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
-    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
-    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));
+    // Allocate memory on the device (GPU)
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes)); // Allocate memory for matrix A
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes)); // Allocate memory for matrix B
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes)); // Allocate memory for the result matrix
 
-    // Copy input matrices from host to device
+    // Copy input matrices from host (CPU) to device (GPU)
     CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, matrixSizeBytes, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, matrixSizeBytes, cudaMemcpyHostToDevice));
 
-    // Define the grid and block dimensions
-    dim3 threadsPerBlock(16, 16);
+    // Define grid and block dimensions for launching the kernel
+    dim3 threadsPerBlock(16, 16); // Each block contains 16x16 threads
     dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
-                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);
+                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y); // Calculate number of blocks required
 
+    // Launch the CUDA kernel to add the matrices on the device
     cuda_kernel::addMatricesKernel<<<numBlocks, threadsPerBlock>>>(
         deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
 
-    // Check for errors
+    // Check for kernel launch errors
     CUDA_CHECK(cudaGetLastError());
 
-    // Wait for GPU to finish
+    // Synchronize the device to ensure kernel execution is complete
    CUDA_CHECK(cudaDeviceSynchronize());
 
-    // Copy the result back to host memory
+    // Copy the result matrix from device (GPU) back to host (CPU)
     CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, matrixSizeBytes, cudaMemcpyDeviceToHost));
 
-    // Free GPU memory
+    // Free the allocated memory on the device
     CUDA_CHECK(cudaFree(deviceMatrixA));
     CUDA_CHECK(cudaFree(deviceMatrixB));
     CUDA_CHECK(cudaFree(deviceResultMatrix));
 }
 
-// Explicit instantiations
+// Explicit template instantiations for float and double types
 template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
 template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
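
For context, a minimal host-side caller for addMatricesOnGPU might look like the sketch below. It is not part of the commit: the 2x3 sample values and the main() framing are illustrative only, and it assumes matrix_add.h declares the addMatricesOnGPU<T> template that is explicitly instantiated above, with CUDA_CHECK from cuda_utils.h handling any CUDA errors internally.

#include <cstdio>
#include <vector>

#include "matrix_add.h"  // Assumed to declare addMatricesOnGPU<T> (hypothetical usage sketch)

int main() {
    const int numRows = 2, numCols = 3;

    // Row-major host matrices with illustrative values
    std::vector<float> a = {1, 2, 3, 4, 5, 6};
    std::vector<float> b = {10, 20, 30, 40, 50, 60};
    std::vector<float> c(numRows * numCols, 0.0f);

    // Runs the kernel above; each element of c becomes a[i] + b[i]
    addMatricesOnGPU(a.data(), b.data(), c.data(), numRows, numCols);

    for (int row = 0; row < numRows; ++row) {
        for (int col = 0; col < numCols; ++col)
            std::printf("%6.1f ", c[row * numCols + col]);
        std::printf("\n");
    }
    return 0;
}
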
@@ -1,62 +1,83 @@
-#include "cuda_utils.h"
-#include "matrix_mult.h"
+#include "cuda_utils.h" // Custom CUDA utility functions and macros for error checking
+#include "matrix_mult.h" // Header for this matrix multiplication module
 
+// Function to perform matrix multiplication on the GPU using cuBLAS
+// This function transfers the input matrices from the host (CPU) to the device (GPU),
+// executes the matrix multiplication on the GPU, and retrieves the result back to the host.
+// Parameters:
+// - hostMatrixA: Pointer to the first matrix (A) on the host (CPU)
+// - hostMatrixB: Pointer to the second matrix (B) on the host (CPU)
+// - hostResultMatrix: Pointer to the result matrix (C) on the host (CPU)
+// - numRowsA: Number of rows in matrix A
+// - numColsA: Number of columns in matrix A (and rows in matrix B)
+// - numColsB: Number of columns in matrix B
 template <typename T>
 void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
                            int numRowsA, int numColsA, int numColsB) {
+    // Calculate the size of matrices A, B, and C in bytes
     size_t byteSizeA = numRowsA * numColsA * sizeof(T);
     size_t byteSizeB = numColsA * numColsB * sizeof(T);
     size_t byteSizeC = numRowsA * numColsB * sizeof(T);
 
+    // Device (GPU) memory pointers for matrices A, B, and result matrix C
     T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
-    // Allocate memory on the GPU
-    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
-    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB));
-    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC));
+    // Allocate memory for matrices on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA)); // Allocate memory for matrix A on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB)); // Allocate memory for matrix B on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC)); // Allocate memory for result matrix C on the GPU
 
-    // Copy input matrices from host to device
+    // Copy matrices A and B from the host (CPU) to the device (GPU)
     CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));
 
+    // Create a cuBLAS handle for matrix multiplication
     cublasHandle_t cublasHandle;
     CUBLAS_CHECK(cublasCreate(&cublasHandle));
 
+    // Define alpha and beta scalars for the matrix multiplication: C = alpha * A * B + beta * C
     const T alpha = 1.0;
     const T beta = 0.0;
 
-    // Perform matrix multiplication using cuBLAS
+    // Perform matrix multiplication using cuBLAS based on the type of T (float or double)
+    // For float: Use cublasSgemm (single precision)
     if constexpr (std::is_same_v<T, float>) {
         CUBLAS_CHECK(cublasSgemm(cublasHandle,
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 numColsB, numRowsA, numColsA,
-                                 &alpha,
-                                 deviceMatrixB, numColsB,
-                                 deviceMatrixA, numColsA,
-                                 &beta,
-                                 deviceResultMatrix, numColsB));
-    } else if constexpr (std::is_same_v<T, double>) {
+                                 CUBLAS_OP_N, CUBLAS_OP_N, // No transposition for both matrices
+                                 numColsB, numRowsA, numColsA, // Dimensions of matrices
+                                 &alpha, // Scalar alpha
+                                 deviceMatrixB, numColsB, // Matrix B in device memory
+                                 deviceMatrixA, numColsA, // Matrix A in device memory
+                                 &beta, // Scalar beta
+                                 deviceResultMatrix, numColsB)); // Result matrix C in device memory
+    }
+    // For double: Use cublasDgemm (double precision)
+    else if constexpr (std::is_same_v<T, double>) {
         CUBLAS_CHECK(cublasDgemm(cublasHandle,
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 numColsB, numRowsA, numColsA,
-                                 &alpha,
-                                 deviceMatrixB, numColsB,
-                                 deviceMatrixA, numColsA,
-                                 &beta,
-                                 deviceResultMatrix, numColsB));
-    } else {
+                                 CUBLAS_OP_N, CUBLAS_OP_N, // No transposition for both matrices
+                                 numColsB, numRowsA, numColsA, // Dimensions of matrices
+                                 &alpha, // Scalar alpha
+                                 deviceMatrixB, numColsB, // Matrix B in device memory
+                                 deviceMatrixA, numColsA, // Matrix A in device memory
+                                 &beta, // Scalar beta
+                                 deviceResultMatrix, numColsB)); // Result matrix C in device memory
+    }
+    // If neither float nor double, fail with a compile-time error
+    else {
         static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
-                      "Only float and double types are supported");
+                      "Only float and double types are supported for matrix multiplication");
     }
 
+    // Copy the result matrix from the device (GPU) back to the host (CPU)
     CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));
 
-    CUBLAS_CHECK(cublasDestroy(cublasHandle));
-    CUDA_CHECK(cudaFree(deviceMatrixA));
-    CUDA_CHECK(cudaFree(deviceMatrixB));
-    CUDA_CHECK(cudaFree(deviceResultMatrix));
+    // Clean up: Destroy cuBLAS handle and free the allocated GPU memory
+    CUBLAS_CHECK(cublasDestroy(cublasHandle)); // Destroy cuBLAS context
+    CUDA_CHECK(cudaFree(deviceMatrixA)); // Free memory for matrix A
+    CUDA_CHECK(cudaFree(deviceMatrixB)); // Free memory for matrix B
+    CUDA_CHECK(cudaFree(deviceResultMatrix)); // Free memory for result matrix C
 }
 
-// Explicit instantiations
+// Explicit template instantiations for float and double types
 template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
 template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
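
A note on the design: cuBLAS assumes column-major storage, so the call above passes B before A with leading dimensions numColsB and numColsA. In column-major terms it computes C^T = B^T * A^T, which leaves the row-major product C = A * B in deviceResultMatrix without any explicit transposes. A minimal host-side caller, again as an illustrative sketch rather than part of the commit (it assumes matrix_mult.h declares the multiplyMatricesOnGPU<T> template instantiated above), could look like this:

#include <cstdio>
#include <vector>

#include "matrix_mult.h"  // Assumed to declare multiplyMatricesOnGPU<T> (hypothetical usage sketch)

int main() {
    // C (2x2) = A (2x3) * B (3x2), all stored row-major with illustrative values
    const int numRowsA = 2, numColsA = 3, numColsB = 2;
    std::vector<double> a = {1, 2, 3,
                             4, 5, 6};
    std::vector<double> b = { 7,  8,
                              9, 10,
                             11, 12};
    std::vector<double> c(numRowsA * numColsB, 0.0);

    multiplyMatricesOnGPU(a.data(), b.data(), c.data(), numRowsA, numColsA, numColsB);

    // Expected result: row 0 -> 58 64, row 1 -> 139 154
    for (int row = 0; row < numRowsA; ++row) {
        for (int col = 0; col < numColsB; ++col)
            std::printf("%8.1f ", c[row * numColsB + col]);
        std::printf("\n");
    }
    return 0;
}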
