
Commit 116437c

style(cuda): add more comments
Parent: b41da2c

2 files changed: +90 -45 lines

@@ -1,58 +1,82 @@
-#include "cuda_utils.h"
-#include "matrix_add.h"
+#include "cuda_utils.h" // Custom CUDA utilities for error checking, etc.
+#include "matrix_add.h" // Header file for this matrix addition module
 
+// Namespace to encapsulate CUDA kernel functions
 namespace cuda_kernel {
 
+// CUDA Kernel: Adds two matrices element-wise on the GPU
+// Each thread computes a single element of the result matrix
+// Parameters:
+// - matrixA: Device pointer to the input matrix A
+// - matrixB: Device pointer to the input matrix B
+// - resultMatrix: Device pointer to the output result matrix
+// - numRows: Number of rows in the matrices
+// - numCols: Number of columns in the matrices
 template <typename T>
 __global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols) {
+    // Calculate the row and column indices for this thread
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
 
+    // Ensure the thread is within valid matrix bounds
     if (row < numRows && col < numCols) {
         int index = row * numCols + col;
-        resultMatrix[index] = matrixA[index] + matrixB[index];
+        resultMatrix[index] = matrixA[index] + matrixB[index]; // Perform element-wise addition
     }
 }
 
 } // namespace cuda_kernel
 
+// C++ Function: Handles matrix addition on the GPU
+// Transfers matrices from the host (CPU) to the device (GPU), performs the computation,
+// and then copies the result back to the host.
+// Parameters:
+// - hostMatrixA: Pointer to matrix A on the host (CPU)
+// - hostMatrixB: Pointer to matrix B on the host (CPU)
+// - hostResultMatrix: Pointer to the result matrix on the host (CPU)
+// - numRows: Number of rows in the matrices
+// - numCols: Number of columns in the matrices
 template <typename T>
 void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols) {
+    // Calculate the size of the matrices in bytes
     size_t matrixSizeBytes = numRows * numCols * sizeof(T);
 
+    // Device (GPU) memory pointers
     T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
-    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
-    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
-    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));
+    // Allocate memory on the device (GPU)
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes)); // Allocate memory for matrix A
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes)); // Allocate memory for matrix B
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes)); // Allocate memory for the result matrix
 
-    // Copy input matrices from host to device
+    // Copy input matrices from host (CPU) to device (GPU)
     CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, matrixSizeBytes, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, matrixSizeBytes, cudaMemcpyHostToDevice));
 
-    // Define the grid and block dimensions
-    dim3 threadsPerBlock(16, 16);
+    // Define grid and block dimensions for launching the kernel
+    dim3 threadsPerBlock(16, 16); // Each block contains 16x16 threads
     dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
-                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);
+                   (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y); // Calculate number of blocks required
 
+    // Launch the CUDA kernel to add the matrices on the device
     cuda_kernel::addMatricesKernel<<<numBlocks, threadsPerBlock>>>(
         deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
 
-    // Check for errors
+    // Check for kernel launch errors
     CUDA_CHECK(cudaGetLastError());
 
-    // Wait for GPU to finish
+    // Synchronize the device to ensure kernel execution is complete
    CUDA_CHECK(cudaDeviceSynchronize());
 
-    // Copy the result back to host memory
+    // Copy the result matrix from device (GPU) back to host (CPU)
     CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, matrixSizeBytes, cudaMemcpyDeviceToHost));
 
-    // Free GPU memory
+    // Free the allocated memory on the device
     CUDA_CHECK(cudaFree(deviceMatrixA));
     CUDA_CHECK(cudaFree(deviceMatrixB));
     CUDA_CHECK(cudaFree(deviceResultMatrix));
 }
 
-// Explicit instantiations
+// Explicit template instantiations for float and double types
 template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
 template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
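
For context, a minimal host-side caller for addMatricesOnGPU might look like the sketch below. It is not part of the commit: the 2x3 sample values and the main() framing are illustrative only, and it assumes matrix_add.h declares the addMatricesOnGPU<T> template that is explicitly instantiated above, with CUDA_CHECK from cuda_utils.h handling any CUDA errors internally.

#include <cstdio>
#include <vector>

#include "matrix_add.h"  // Assumed to declare addMatricesOnGPU<T> (hypothetical usage sketch)

int main() {
    const int numRows = 2, numCols = 3;

    // Row-major host matrices with illustrative values
    std::vector<float> a = {1, 2, 3, 4, 5, 6};
    std::vector<float> b = {10, 20, 30, 40, 50, 60};
    std::vector<float> c(numRows * numCols, 0.0f);

    // Runs the kernel above; each element of c becomes a[i] + b[i]
    addMatricesOnGPU(a.data(), b.data(), c.data(), numRows, numCols);

    for (int row = 0; row < numRows; ++row) {
        for (int col = 0; col < numCols; ++col)
            std::printf("%6.1f ", c[row * numCols + col]);
        std::printf("\n");
    }
    return 0;
}
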
@@ -1,62 +1,83 @@
-#include "cuda_utils.h"
-#include "matrix_mult.h"
+#include "cuda_utils.h" // Custom CUDA utility functions and macros for error checking
+#include "matrix_mult.h" // Header for this matrix multiplication module
 
+// Function to perform matrix multiplication on the GPU using cuBLAS
+// This function transfers the input matrices from the host (CPU) to the device (GPU),
+// executes the matrix multiplication on the GPU, and retrieves the result back to the host.
+// Parameters:
+// - hostMatrixA: Pointer to the first matrix (A) on the host (CPU)
+// - hostMatrixB: Pointer to the second matrix (B) on the host (CPU)
+// - hostResultMatrix: Pointer to the result matrix (C) on the host (CPU)
+// - numRowsA: Number of rows in matrix A
+// - numColsA: Number of columns in matrix A (and rows in matrix B)
+// - numColsB: Number of columns in matrix B
 template <typename T>
 void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
                            int numRowsA, int numColsA, int numColsB) {
+    // Calculate the size of matrices A, B, and C in bytes
     size_t byteSizeA = numRowsA * numColsA * sizeof(T);
     size_t byteSizeB = numColsA * numColsB * sizeof(T);
     size_t byteSizeC = numRowsA * numColsB * sizeof(T);
 
+    // Device (GPU) memory pointers for matrices A, B, and result matrix C
     T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
-    // Allocate memory on the GPU
-    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
-    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB));
-    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC));
+    // Allocate memory for matrices on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA)); // Allocate memory for matrix A on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceMatrixB, byteSizeB)); // Allocate memory for matrix B on the GPU
+    CUDA_CHECK(cudaMalloc(&deviceResultMatrix, byteSizeC)); // Allocate memory for result matrix C on the GPU
 
-    // Copy input matrices from host to device
+    // Copy matrices A and B from the host (CPU) to the device (GPU)
     CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));
 
+    // Create a cuBLAS handle for matrix multiplication
     cublasHandle_t cublasHandle;
     CUBLAS_CHECK(cublasCreate(&cublasHandle));
 
+    // Define alpha and beta scalars for the matrix multiplication: C = alpha * A * B + beta * C
     const T alpha = 1.0;
     const T beta = 0.0;
 
-    // Perform matrix multiplication using cuBLAS
+    // Perform matrix multiplication using cuBLAS based on the type of T (float or double)
+    // For float: Use cublasSgemm (single precision)
     if constexpr (std::is_same_v<T, float>) {
         CUBLAS_CHECK(cublasSgemm(cublasHandle,
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 numColsB, numRowsA, numColsA,
-                                 &alpha,
-                                 deviceMatrixB, numColsB,
-                                 deviceMatrixA, numColsA,
-                                 &beta,
-                                 deviceResultMatrix, numColsB));
-    } else if constexpr (std::is_same_v<T, double>) {
+                                 CUBLAS_OP_N, CUBLAS_OP_N, // No transposition for both matrices
+                                 numColsB, numRowsA, numColsA, // Dimensions of matrices
+                                 &alpha, // Scalar alpha
+                                 deviceMatrixB, numColsB, // Matrix B in device memory
+                                 deviceMatrixA, numColsA, // Matrix A in device memory
+                                 &beta, // Scalar beta
+                                 deviceResultMatrix, numColsB)); // Result matrix C in device memory
+    }
+    // For double: Use cublasDgemm (double precision)
+    else if constexpr (std::is_same_v<T, double>) {
         CUBLAS_CHECK(cublasDgemm(cublasHandle,
-                                 CUBLAS_OP_N, CUBLAS_OP_N,
-                                 numColsB, numRowsA, numColsA,
-                                 &alpha,
-                                 deviceMatrixB, numColsB,
-                                 deviceMatrixA, numColsA,
-                                 &beta,
-                                 deviceResultMatrix, numColsB));
-    } else {
+                                 CUBLAS_OP_N, CUBLAS_OP_N, // No transposition for both matrices
+                                 numColsB, numRowsA, numColsA, // Dimensions of matrices
+                                 &alpha, // Scalar alpha
+                                 deviceMatrixB, numColsB, // Matrix B in device memory
+                                 deviceMatrixA, numColsA, // Matrix A in device memory
+                                 &beta, // Scalar beta
+                                 deviceResultMatrix, numColsB)); // Result matrix C in device memory
+    }
+    // If neither float nor double, fail with a compile-time error
+    else {
         static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
-                      "Only float and double types are supported");
+                      "Only float and double types are supported for matrix multiplication");
     }
 
+    // Copy the result matrix from the device (GPU) back to the host (CPU)
     CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));
 
-    CUBLAS_CHECK(cublasDestroy(cublasHandle));
-    CUDA_CHECK(cudaFree(deviceMatrixA));
-    CUDA_CHECK(cudaFree(deviceMatrixB));
-    CUDA_CHECK(cudaFree(deviceResultMatrix));
+    // Clean up: Destroy cuBLAS handle and free the allocated GPU memory
+    CUBLAS_CHECK(cublasDestroy(cublasHandle)); // Destroy cuBLAS context
+    CUDA_CHECK(cudaFree(deviceMatrixA)); // Free memory for matrix A
+    CUDA_CHECK(cudaFree(deviceMatrixB)); // Free memory for matrix B
+    CUDA_CHECK(cudaFree(deviceResultMatrix)); // Free memory for result matrix C
 }
 
-// Explicit instantiations
+// Explicit template instantiations for float and double types
 template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
 template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
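
A note on the design: cuBLAS assumes column-major storage, so the call above passes B before A with leading dimensions numColsB and numColsA. In column-major terms it computes C^T = B^T * A^T, which leaves the row-major product C = A * B in deviceResultMatrix without any explicit transposes. A minimal host-side caller, again as an illustrative sketch rather than part of the commit (it assumes matrix_mult.h declares the multiplyMatricesOnGPU<T> template instantiated above), could look like this:

#include <cstdio>
#include <vector>

#include "matrix_mult.h"  // Assumed to declare multiplyMatricesOnGPU<T> (hypothetical usage sketch)

int main() {
    // C (2x2) = A (2x3) * B (3x2), all stored row-major with illustrative values
    const int numRowsA = 2, numColsA = 3, numColsB = 2;
    std::vector<double> a = {1, 2, 3,
                             4, 5, 6};
    std::vector<double> b = { 7,  8,
                              9, 10,
                             11, 12};
    std::vector<double> c(numRowsA * numColsB, 0.0);

    multiplyMatricesOnGPU(a.data(), b.data(), c.data(), numRowsA, numColsA, numColsB);

    // Expected result: row 0 -> 58 64, row 1 -> 139 154
    for (int row = 0; row < numRowsA; ++row) {
        for (int col = 0; col < numColsB; ++col)
            std::printf("%8.1f ", c[row * numColsB + col]);
        std::printf("\n");
    }
    return 0;
}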
