// multSq.cu -- EE423 Embedded Systems, Lab06 (GPU programming).
// Source: Uthmanhere/EE423-EmbeddedSystems on GitHub (master branch).

#include <stdio.h>


#define SIZE 64  // 64-by-64 square matrix

/*
 * matMult: computes matProd = matA * matB for SIZE-by-SIZE integer matrices
 * stored contiguously in row-major order (element (r, c) lives at r*SIZE + c).
 *
 * Parameters (all device pointers to SIZE*SIZE ints):
 *   matProd - output matrix; every element is overwritten.
 *   matA    - left operand, read only.
 *   matB    - right operand, read only.
 *
 * Launch layout: one-dimensional grid and one-dimensional blocks. Blocks
 * walk over rows and threads walk over columns with strides of gridDim.x
 * and blockDim.x, so the full product is produced for any launch
 * configuration. With <<<SIZE, SIZE>>> each thread computes exactly one
 * output element. No shared memory is used.
 */
__global__ void matMult(int * matProd, const int * matA, const int * matB)
{
        // Striding over rows/columns keeps the result complete even when the
        // grid has fewer than SIZE blocks or a block fewer than SIZE threads.
        for (int row = blockIdx.x; row < SIZE; row += gridDim.x)
        {
                for (int col = threadIdx.x; col < SIZE; col += blockDim.x)
                {
                        // dot product of row `row` of matA with column `col` of matB
                        int tmpSum = 0;
                        for (int i=0; i<SIZE; ++i)
                                tmpSum += matA[row*SIZE + i] * matB[i*SIZE + col];
                        matProd[row*SIZE + col] = tmpSum;
                }
        }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char * what)
{
        if (err == cudaSuccess)
                return true;
        fprintf(stderr, "Error: %s failed: %s\n", what, cudaGetErrorString(err));
        return false;
}

/*
 * Fills two SIZE-by-SIZE matrices on the host, multiplies them on the GPU
 * with matMult and prints the first SIZE/2 diagonal elements of the product.
 *
 * Returns 0 on success and -1 if any CUDA call or the kernel fails.
 */
int main()
{
        const size_t bytes = SIZE*SIZE*sizeof(int);

        // initialize, allocate and define host memory
        int matA[SIZE*SIZE] = { 0 };
        int matB[SIZE*SIZE] = { 0 };
        int matProd[SIZE*SIZE] = { 0 };
        for (int i=0; i<SIZE; ++i)
        {
                for (int j=0; j<SIZE; ++j)
                {
                        matA[i*SIZE + j] = i+j;
                        matB[i*SIZE + j] = i-j;
                }
        }

        // Device pointers start as NULL so the cleanup below is safe no matter
        // which step failed (cudaFree on a null pointer performs no operation).
        int * dev_matProd = NULL, * dev_matA = NULL, * dev_matB = NULL;

        // allocate device memory and copy the operands over; the && chain
        // stops at the first failing call
        bool ok = cudaOk(cudaMalloc((void **)&dev_matA, bytes),
                        "cudaMalloc(dev_matA)")
                && cudaOk(cudaMalloc((void **)&dev_matB, bytes),
                        "cudaMalloc(dev_matB)")
                && cudaOk(cudaMalloc((void **)&dev_matProd, bytes),
                        "cudaMalloc(dev_matProd)")
                && cudaOk(cudaMemcpy((void *)dev_matA, (void *)matA, bytes,
                                cudaMemcpyHostToDevice),
                        "cudaMemcpy(matA -> device)")
                && cudaOk(cudaMemcpy((void *)dev_matB, (void *)matB, bytes,
                                cudaMemcpyHostToDevice),
                        "cudaMemcpy(matB -> device)");

        if (ok)
        {
                matMult<<<SIZE,SIZE>>>(dev_matProd, dev_matA, dev_matB);

                // A kernel launch returns nothing itself: cudaGetLastError
                // reports a bad launch configuration, cudaDeviceSynchronize
                // reports faults raised while the kernel was executing.
                ok = cudaOk(cudaGetLastError(), "matMult launch")
                        && cudaOk(cudaDeviceSynchronize(), "matMult execution")
                        // copy results from device to host memory
                        && cudaOk(cudaMemcpy(matProd, dev_matProd, bytes,
                                        cudaMemcpyDeviceToHost),
                                "cudaMemcpy(matProd -> host)");
        }

        // release device memory on every path, success or failure
        cudaFree(dev_matA);
        cudaFree(dev_matB);
        cudaFree(dev_matProd);

        if (!ok)
                return -1;

        for (int i=0; i<SIZE/2; ++i)  // inspecting first few diagonal elements
                printf(" > Diagonal %d of prudect is %d.\n",
                                i, matProd[i*SIZE+i]);

        return 0;
}